feat(api): support asImage=true on screen and zoom
All checks were successful
python-syntax / syntax-check (push) Successful in 4s

This commit is contained in:
2026-04-05 20:03:32 +02:00
parent 4aa51e2d69
commit 683f2d5961
3 changed files with 28 additions and 8 deletions

View File

@@ -4,8 +4,8 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
 ## What this provides
-- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells
+- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes)
-- **Zoom endpoint**: crop around a point with denser grid for fine targeting
+- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
 - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
 - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
 - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction

View File

@@ -22,8 +22,9 @@ Query params:
 - `include_labels` (bool, default `true`)
 - `image_format` (`png`|`jpeg`, default `png`)
 - `jpeg_quality` (1-100, default `85`)
+- `asImage` (bool, default `false`) — if `true`, return raw image bytes only (`image/png` or `image/jpeg`)
-Response includes base64 image and metadata (`meta.region`, optional `meta.grid`).
+Default response includes base64 image and metadata (`meta.region`, optional `meta.grid`).
 ## `POST /zoom`
@@ -44,7 +45,11 @@ Body:
 }
 ```
-Returns cropped image + region metadata in global pixel coordinates.
+Query params:
+- `asImage` (bool, default `false`) — if `true`, return raw image bytes only (`image/png` or `image/jpeg`)
+Default response returns cropped image + region metadata in global pixel coordinates.
 ## `POST /action`

View File

@@ -5,7 +5,7 @@ import time
import uuid import uuid
from typing import Literal, Optional from typing import Literal, Optional
from fastapi import Depends, FastAPI, Header, HTTPException from fastapi import Depends, FastAPI, Header, HTTPException, Response
from pydantic import BaseModel, Field, model_validator from pydantic import BaseModel, Field, model_validator
@@ -160,13 +160,17 @@ def _capture_screen():
return image, {"x": mon["left"], "y": mon["top"], "width": mon["width"], "height": mon["height"]} return image, {"x": mon["left"], "y": mon["top"], "width": mon["width"], "height": mon["height"]}
def _encode_image(image, image_format: str, jpeg_quality: int) -> str: def _serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
buf = io.BytesIO() buf = io.BytesIO()
if image_format == "jpeg": if image_format == "jpeg":
image.save(buf, format="JPEG", quality=jpeg_quality) image.save(buf, format="JPEG", quality=jpeg_quality)
else: else:
image.save(buf, format="PNG") image.save(buf, format="PNG")
return base64.b64encode(buf.getvalue()).decode("ascii") return buf.getvalue()
def _encode_image(image, image_format: str, jpeg_quality: int) -> str:
return base64.b64encode(_serialize_image(image, image_format, jpeg_quality)).decode("ascii")
def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool): def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool):
@@ -335,6 +339,7 @@ def screen(
include_labels: bool = True, include_labels: bool = True,
image_format: Literal["png", "jpeg"] = "png", image_format: Literal["png", "jpeg"] = "png",
jpeg_quality: int = 85, jpeg_quality: int = 85,
asImage: bool = False,
_: None = Depends(_auth), _: None = Depends(_auth),
): ):
req = ScreenRequest( req = ScreenRequest(
@@ -354,6 +359,11 @@ def screen(
out_img, grid_meta = _draw_grid(base_img, mon["x"], mon["y"], req.grid_rows, req.grid_cols, req.include_labels) out_img, grid_meta = _draw_grid(base_img, mon["x"], mon["y"], req.grid_rows, req.grid_cols, req.include_labels)
meta.update(grid_meta) meta.update(grid_meta)
if asImage:
image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality)
media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png"
return Response(content=image_bytes, media_type=media_type)
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality) encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
return { return {
"ok": True, "ok": True,
@@ -370,7 +380,7 @@ def screen(
@app.post("/zoom") @app.post("/zoom")
def zoom(req: ZoomRequest, _: None = Depends(_auth)): def zoom(req: ZoomRequest, asImage: bool = False, _: None = Depends(_auth)):
base_img, mon = _capture_screen() base_img, mon = _capture_screen()
cx = req.center_x - mon["x"] cx = req.center_x - mon["x"]
@@ -404,6 +414,11 @@ def zoom(req: ZoomRequest, _: None = Depends(_auth)):
out_img, grid_meta = _draw_grid(crop, region_x, region_y, req.grid_rows, req.grid_cols, req.include_labels) out_img, grid_meta = _draw_grid(crop, region_x, region_y, req.grid_rows, req.grid_cols, req.include_labels)
meta.update(grid_meta) meta.update(grid_meta)
if asImage:
image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality)
media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png"
return Response(content=image_bytes, media_type=media_type)
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality) encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
return { return {