feat(api): support asImage=true on screen and zoom
All checks were successful
python-syntax / syntax-check (push) Successful in 4s
All checks were successful
python-syntax / syntax-check (push) Successful in 4s
This commit is contained in:
@@ -4,8 +4,8 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
|
|||||||
|
|
||||||
## What this provides
|
## What this provides
|
||||||
|
|
||||||
- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells
|
- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (with `asImage=true`, the raw image bytes are returned instead of the base64 + metadata response)
|
||||||
- **Zoom endpoint**: crop around a point with denser grid for fine targeting
|
- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
|
||||||
- **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
|
- **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
|
||||||
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
|
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
|
||||||
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
|
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
|
||||||
|
|||||||
@@ -22,8 +22,9 @@ Query params:
|
|||||||
- `include_labels` (bool, default `true`)
|
- `include_labels` (bool, default `true`)
|
||||||
- `image_format` (`png`|`jpeg`, default `png`)
|
- `image_format` (`png`|`jpeg`, default `png`)
|
||||||
- `jpeg_quality` (1-100, default `85`)
|
- `jpeg_quality` (1-100, default `85`)
|
||||||
|
- `asImage` (bool, default `false`) — if `true`, return only the raw image bytes (`Content-Type` is `image/png` or `image/jpeg`, matching `image_format`); no metadata is included in the response
|
||||||
|
|
||||||
Response includes base64 image and metadata (`meta.region`, optional `meta.grid`).
|
By default, the response includes a base64-encoded image and metadata (`meta.region`, optional `meta.grid`).
|
||||||
|
|
||||||
## `POST /zoom`
|
## `POST /zoom`
|
||||||
|
|
||||||
@@ -44,7 +45,11 @@ Body:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Returns cropped image + region metadata in global pixel coordinates.
|
Query params:
|
||||||
|
|
||||||
|
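- `asImage` (bool, default `false`) — if `true`, return only the raw image bytes (`Content-Type` is `image/png` or `image/jpeg`, matching the request's `image_format`); no metadata is included in the response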
|
||||||
|
|
||||||
|
By default, the response contains the cropped image plus region metadata in global pixel coordinates.
|
||||||
|
|
||||||
## `POST /action`
|
## `POST /action`
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import time
|
|||||||
import uuid
|
import uuid
|
||||||
from typing import Literal, Optional
|
from typing import Literal, Optional
|
||||||
|
|
||||||
from fastapi import Depends, FastAPI, Header, HTTPException
|
from fastapi import Depends, FastAPI, Header, HTTPException, Response
|
||||||
from pydantic import BaseModel, Field, model_validator
|
from pydantic import BaseModel, Field, model_validator
|
||||||
|
|
||||||
|
|
||||||
@@ -160,13 +160,17 @@ def _capture_screen():
|
|||||||
return image, {"x": mon["left"], "y": mon["top"], "width": mon["width"], "height": mon["height"]}
|
return image, {"x": mon["left"], "y": mon["top"], "width": mon["width"], "height": mon["height"]}
|
||||||
|
|
||||||
|
|
||||||
def _encode_image(image, image_format: str, jpeg_quality: int) -> str:
|
def _serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
|
||||||
buf = io.BytesIO()
|
buf = io.BytesIO()
|
||||||
if image_format == "jpeg":
|
if image_format == "jpeg":
|
||||||
image.save(buf, format="JPEG", quality=jpeg_quality)
|
image.save(buf, format="JPEG", quality=jpeg_quality)
|
||||||
else:
|
else:
|
||||||
image.save(buf, format="PNG")
|
image.save(buf, format="PNG")
|
||||||
return base64.b64encode(buf.getvalue()).decode("ascii")
|
return buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def _encode_image(image, image_format: str, jpeg_quality: int) -> str:
|
||||||
|
return base64.b64encode(_serialize_image(image, image_format, jpeg_quality)).decode("ascii")
|
||||||
|
|
||||||
|
|
||||||
def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool):
|
def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool):
|
||||||
@@ -335,6 +339,7 @@ def screen(
|
|||||||
include_labels: bool = True,
|
include_labels: bool = True,
|
||||||
image_format: Literal["png", "jpeg"] = "png",
|
image_format: Literal["png", "jpeg"] = "png",
|
||||||
jpeg_quality: int = 85,
|
jpeg_quality: int = 85,
|
||||||
|
asImage: bool = False,
|
||||||
_: None = Depends(_auth),
|
_: None = Depends(_auth),
|
||||||
):
|
):
|
||||||
req = ScreenRequest(
|
req = ScreenRequest(
|
||||||
@@ -354,6 +359,11 @@ def screen(
|
|||||||
out_img, grid_meta = _draw_grid(base_img, mon["x"], mon["y"], req.grid_rows, req.grid_cols, req.include_labels)
|
out_img, grid_meta = _draw_grid(base_img, mon["x"], mon["y"], req.grid_rows, req.grid_cols, req.include_labels)
|
||||||
meta.update(grid_meta)
|
meta.update(grid_meta)
|
||||||
|
|
||||||
|
if asImage:
|
||||||
|
image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality)
|
||||||
|
media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png"
|
||||||
|
return Response(content=image_bytes, media_type=media_type)
|
||||||
|
|
||||||
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
|
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
|
||||||
return {
|
return {
|
||||||
"ok": True,
|
"ok": True,
|
||||||
@@ -370,7 +380,7 @@ def screen(
|
|||||||
|
|
||||||
|
|
||||||
@app.post("/zoom")
|
@app.post("/zoom")
|
||||||
def zoom(req: ZoomRequest, _: None = Depends(_auth)):
|
def zoom(req: ZoomRequest, asImage: bool = False, _: None = Depends(_auth)):
|
||||||
base_img, mon = _capture_screen()
|
base_img, mon = _capture_screen()
|
||||||
|
|
||||||
cx = req.center_x - mon["x"]
|
cx = req.center_x - mon["x"]
|
||||||
@@ -404,6 +414,11 @@ def zoom(req: ZoomRequest, _: None = Depends(_auth)):
|
|||||||
out_img, grid_meta = _draw_grid(crop, region_x, region_y, req.grid_rows, req.grid_cols, req.include_labels)
|
out_img, grid_meta = _draw_grid(crop, region_x, region_y, req.grid_rows, req.grid_cols, req.include_labels)
|
||||||
meta.update(grid_meta)
|
meta.update(grid_meta)
|
||||||
|
|
||||||
|
if asImage:
|
||||||
|
image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality)
|
||||||
|
media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png"
|
||||||
|
return Response(content=image_bytes, media_type=media_type)
|
||||||
|
|
||||||
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
|
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user