From 683f2d59612880159e321a66a5cba5c540f024e1 Mon Sep 17 00:00:00 2001 From: Luna Date: Sun, 5 Apr 2026 20:03:32 +0200 Subject: [PATCH] feat(api): support asImage=true on screen and zoom --- README.md | 4 ++-- docs/API.md | 9 +++++++-- server/app.py | 23 +++++++++++++++++++---- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index dbf6627..62dd3be 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots ## What this provides -- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells -- **Zoom endpoint**: crop around a point with denser grid for fine targeting +- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes) +- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported) - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction diff --git a/docs/API.md b/docs/API.md index 659eeaf..4a266b6 100644 --- a/docs/API.md +++ b/docs/API.md @@ -22,8 +22,9 @@ Query params: - `include_labels` (bool, default `true`) - `image_format` (`png`|`jpeg`, default `png`) - `jpeg_quality` (1-100, default `85`) +- `asImage` (bool, default `false`) — if `true`, return raw image bytes only (`image/png` or `image/jpeg`) -Response includes base64 image and metadata (`meta.region`, optional `meta.grid`). +Default response includes base64 image and metadata (`meta.region`, optional `meta.grid`). ## `POST /zoom` @@ -44,7 +45,11 @@ Body: } ``` -Returns cropped image + region metadata in global pixel coordinates. +Query params: + +- `asImage` (bool, default `false`) — if `true`, return raw image bytes only (`image/png` or `image/jpeg`) + +Default response returns cropped image + region metadata in global pixel coordinates. ## `POST /action` diff --git a/server/app.py b/server/app.py index f2fd12f..91867f2 100644 --- a/server/app.py +++ b/server/app.py @@ -5,7 +5,7 @@ import time import uuid from typing import Literal, Optional -from fastapi import Depends, FastAPI, Header, HTTPException +from fastapi import Depends, FastAPI, Header, HTTPException, Response from pydantic import BaseModel, Field, model_validator @@ -160,13 +160,17 @@ def _capture_screen(): return image, {"x": mon["left"], "y": mon["top"], "width": mon["width"], "height": mon["height"]} -def _encode_image(image, image_format: str, jpeg_quality: int) -> str: +def _serialize_image(image, image_format: str, jpeg_quality: int) -> bytes: buf = io.BytesIO() if image_format == "jpeg": image.save(buf, format="JPEG", quality=jpeg_quality) else: image.save(buf, format="PNG") - return base64.b64encode(buf.getvalue()).decode("ascii") + return buf.getvalue() + + +def _encode_image(image, image_format: str, jpeg_quality: int) -> str: + return base64.b64encode(_serialize_image(image, image_format, jpeg_quality)).decode("ascii") def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool): @@ -335,6 +339,7 @@ def screen( include_labels: bool = True, image_format: Literal["png", "jpeg"] = "png", jpeg_quality: int = 85, + asImage: bool = False, _: None = Depends(_auth), ): req = ScreenRequest( @@ -354,6 +359,11 @@ def screen( out_img, grid_meta = _draw_grid(base_img, mon["x"], mon["y"], req.grid_rows, req.grid_cols, req.include_labels) meta.update(grid_meta) + if asImage: + image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality) + media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png" + return Response(content=image_bytes, media_type=media_type) + encoded = _encode_image(out_img, req.image_format, req.jpeg_quality) return { "ok": True, @@ -370,7 +380,7 @@ def screen( @app.post("/zoom") -def zoom(req: ZoomRequest, _: None = Depends(_auth)): +def zoom(req: ZoomRequest, asImage: bool = False, _: None = Depends(_auth)): base_img, mon = _capture_screen() cx = req.center_x - mon["x"] @@ -404,6 +414,11 @@ def zoom(req: ZoomRequest, _: None = Depends(_auth)): out_img, grid_meta = _draw_grid(crop, region_x, region_y, req.grid_rows, req.grid_cols, req.include_labels) meta.update(grid_meta) + if asImage: + image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality) + media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png" + return Response(content=image_bytes, media_type=media_type) + encoded = _encode_image(out_img, req.image_format, req.jpeg_quality) return {