diff --git a/README.md b/README.md index c6ee0be..f858df2 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots - **Window lifecycle endpoints**: list/focus/restore/minimize/maximize/close windows via `GET /windows` + `POST /windows/action` - **Structured launch endpoint**: start an app/process without dropping to a shell via `POST /launch` - **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait` +- **Vision helper endpoints**: compare screenshots and measure stability via `POST /vision/diff` and `POST /vision/stability` - **OCR endpoints**: extract text blocks or search for matching text via `POST /ocr` and `POST /ocr/find` - **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec` - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels diff --git a/docs/API.md b/docs/API.md index 72c8f3a..27c2b66 100644 --- a/docs/API.md +++ b/docs/API.md @@ -282,6 +282,70 @@ Notes: - If `wait_for_window=true`, the server polls for a matching window and returns `window_found`. - `dry_run=true` returns the resolved argv/cwd without launching. +## `POST /vision/diff` + +Measure whether a screen region changed meaningfully between two captures. 
+ +Query params: + +- `screen` (int, default `0`) - used for `mode=screen` and `mode=region` + +Compare live captures: + +```json +{ + "mode": "region", + "region_x": 120, + "region_y": 80, + "region_width": 600, + "region_height": 300, + "delay_ms": 400, + "diff_threshold": 0.01 +} +``` + +Compare provided images: + +```json +{ + "mode": "image", + "before_image_base64": "iVBORw0KGgoAAA...", + "after_image_base64": "iVBORw0KGgoBBB...", + "diff_threshold": 0.01 +} +``` + +Response includes: +- `diff_ratio` — average normalized pixel difference +- `changed` — whether `diff_ratio >= diff_threshold` +- `region` — compared region + +## `POST /vision/stability` + +Measure whether a screen region stays visually stable over a short interval. + +Query params: + +- `screen` (int, default `0`) + +```json +{ + "region_x": 0, + "region_y": 0, + "region_width": 1920, + "region_height": 1080, + "sample_interval_ms": 250, + "duration_ms": 1200, + "diff_threshold": 0.005 +} +``` + +Response includes: +- `stable` +- `sample_count` +- `max_diff_ratio` +- `avg_diff_ratio` + ## `POST /wait` Wait on a structured UI condition instead of guessing sleep durations. 
diff --git a/server/app.py b/server/app.py
index 8822492..30d8564 100644
--- a/server/app.py
+++ b/server/app.py
@@ -246,6 +246,52 @@ class OCRFindRequest(OCRRequest):
     max_results: int = Field(default=20, ge=1, le=200)
 
 
+class VisionDiffRequest(BaseModel):
+    """Request body for ``POST /vision/diff``.
+
+    ``mode="region"`` requires all four ``region_*`` fields and
+    ``mode="image"`` requires both base64 images; see ``_validate_inputs``.
+    """
+
+    mode: Literal["screen", "region", "image"] = "screen"
+    region_x: int | None = Field(default=None, ge=0)
+    region_y: int | None = Field(default=None, ge=0)
+    region_width: int | None = Field(default=None, gt=0)
+    region_height: int | None = Field(default=None, gt=0)
+    before_image_base64: str | None = None
+    after_image_base64: str | None = None
+    # Pause between the two live captures; not read when mode="image".
+    delay_ms: int = Field(default=300, ge=0, le=60000)
+    # ``changed`` is reported when diff_ratio >= diff_threshold.
+    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)
+
+    @model_validator(mode="after")
+    def _validate_inputs(self):
+        """Reject requests that lack the fields their ``mode`` depends on."""
+        if self.mode == "region":
+            required = [self.region_x, self.region_y, self.region_width, self.region_height]
+            if any(v is None for v in required):
+                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
+        if self.mode == "image" and (not self.before_image_base64 or not self.after_image_base64):
+            raise ValueError("before_image_base64 and after_image_base64 are required for mode=image")
+        return self
+
+
+class VisionStabilityRequest(BaseModel):
+    """Request body for ``POST /vision/stability``."""
+
+    region_x: int | None = Field(default=None, ge=0)
+    region_y: int | None = Field(default=None, ge=0)
+    region_width: int | None = Field(default=None, gt=0)
+    region_height: int | None = Field(default=None, gt=0)
+    # Target spacing between successive captures.
+    sample_interval_ms: int = Field(default=250, ge=50, le=10000)
+    # Length of the observation window.
+    duration_ms: int = Field(default=1200, ge=0, le=120000)
+    # ``stable`` requires every sample diff to be <= this ratio.
+    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)
+
+
 def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
     token = SETTINGS["token"]
 
@@ -898,6 +944,133 @@ def _find_text_matches(blocks: list[dict], query: str, match_mode: str, group_li
     return matches[:max_results]
 
 
+def _compute_visual_diff(req: VisionDiffRequest, screen: int = 0) -> dict:
+    """Compare two images and report how much they differ.
+
+    For ``mode="image"`` the two base64 payloads in the request are decoded
+    and compared directly. Otherwise the screen or region is captured twice,
+    ``req.delay_ms`` apart, and the two captures are compared.
+
+    Args:
+        req: Validated diff request.
+        screen: Screen selector forwarded to ``_capture_region_image``.
+
+    Returns:
+        A dict with ``mode``, ``region``, ``diff_ratio``, ``changed`` and
+        ``diff_threshold``; live captures also include ``screen``,
+        ``display`` and ``delay_ms``.
+
+    Raises:
+        HTTPException: 400 when the provided images differ in size.
+    """
+    if req.mode == "image":
+        before = _decode_image_base64(req.before_image_base64 or "")
+        after = _decode_image_base64(req.after_image_base64 or "")
+        if before.size != after.size:
+            raise HTTPException(status_code=400, detail="before and after images must have matching dimensions")
+        diff_ratio = _image_diff_ratio(before, after)
+        return {
+            "mode": req.mode,
+            "region": {"x": 0, "y": 0, "width": before.size[0], "height": before.size[1]},
+            "diff_ratio": diff_ratio,
+            "changed": diff_ratio >= req.diff_threshold,
+            "diff_threshold": req.diff_threshold,
+        }
+
+    before, region, mon, _, screen_selection = _capture_region_image(
+        screen,
+        req.region_x,
+        req.region_y,
+        req.region_width,
+        req.region_height,
+    )
+    if req.delay_ms > 0:
+        time.sleep(req.delay_ms / 1000.0)
+    # Re-capture with the region returned by the first capture so both frames cover the same area.
+    after, _, _, _, _ = _capture_region_image(
+        screen,
+        region["x"],
+        region["y"],
+        region["width"],
+        region["height"],
+    )
+    diff_ratio = _image_diff_ratio(before, after)
+    return {
+        "mode": req.mode,
+        "region": region,
+        "diff_ratio": diff_ratio,
+        "changed": diff_ratio >= req.diff_threshold,
+        "diff_threshold": req.diff_threshold,
+        "screen": screen_selection,
+        "display": mon,
+        "delay_ms": req.delay_ms,
+    }
+
+
+def _measure_stability(req: VisionStabilityRequest, screen: int = 0) -> dict:
+    """Sample a region repeatedly and report frame-to-frame change.
+
+    Each capture is compared with the previous capture (not the first), so
+    ``max_diff_ratio`` is the largest change between consecutive samples.
+
+    At least one comparison is always made, and the sleep before each sample
+    is clamped to the time left so a long ``sample_interval_ms`` cannot hold
+    the request far past ``duration_ms``.
+
+    Args:
+        req: Validated stability request.
+        screen: Screen selector forwarded to ``_capture_region_image``.
+
+    Returns:
+        A dict with ``stable``, ``region``, ``sample_count``,
+        ``max_diff_ratio``, ``avg_diff_ratio`` and the echoed request
+        settings, ``screen`` and ``display``.
+    """
+    baseline, region, mon, _, screen_selection = _capture_region_image(
+        screen,
+        req.region_x,
+        req.region_y,
+        req.region_width,
+        req.region_height,
+    )
+    sample_count = 0
+    max_diff_ratio = 0.0
+    diffs = []
+    interval_s = req.sample_interval_ms / 1000.0
+    # Monotonic clock: wall-clock adjustments must not stretch or shorten the window.
+    deadline = time.monotonic() + (req.duration_ms / 1000.0)
+    while True:
+        remaining_s = deadline - time.monotonic()
+        time.sleep(max(0.0, min(interval_s, remaining_s)))
+        current, _, _, _, _ = _capture_region_image(
+            screen,
+            region["x"],
+            region["y"],
+            region["width"],
+            region["height"],
+        )
+        diff_ratio = _image_diff_ratio(baseline, current)
+        diffs.append(diff_ratio)
+        max_diff_ratio = max(max_diff_ratio, diff_ratio)
+        sample_count += 1
+        baseline = current
+        if time.monotonic() >= deadline:
+            break
+
+    return {
+        "stable": max_diff_ratio <= req.diff_threshold,
+        "region": region,
+        "sample_count": sample_count,
+        "max_diff_ratio": max_diff_ratio,
+        "avg_diff_ratio": round(sum(diffs) / len(diffs), 6) if diffs else 0.0,
+        "diff_threshold": req.diff_threshold,
+        "duration_ms": req.duration_ms,
+        "sample_interval_ms": req.sample_interval_ms,
+        "screen": screen_selection,
+        "display": mon,
+    }
+
+
 def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
     condition = req.condition
     deadline = time.time() + (req.timeout_ms / 1000.0)
@@ -1433,6 +1606,30 @@ def wait(req: WaitRequest, screen: int = 0, _: None = Depends(_auth)):
     }
 
 
+@app.post("/vision/diff")
+def vision_diff(req: VisionDiffRequest, screen: int = 0, _: None = Depends(_auth)):
+    """Compare two live captures (or two provided images) and report the diff."""
+    result = _compute_visual_diff(req, screen)
+    return {
+        "ok": True,
+        "request_id": _request_id(),
+        "time_ms": _now_ms(),
+        "result": result,
+    }
+
+
+@app.post("/vision/stability")
+def vision_stability(req: VisionStabilityRequest, screen: int = 0, _: None = Depends(_auth)):
+    """Report whether a screen region stays visually stable over a short window."""
+    result = _measure_stability(req, screen)
+    return {
+        "ok": True,
+        "request_id": _request_id(),
+        "time_ms": _now_ms(),
+        "result": result,
+    }
+
+
 @app.post("/ocr")
 def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
     image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
diff --git a/skill/SKILL.md b/skill/SKILL.md
index c46a66b..53455a3 100644
--- a/skill/SKILL.md
+++ b/skill/SKILL.md
@@ -54,6 +54,8 @@ Say what you actually have: screenshots, OCR output, and fresh verification capt
 - `POST /windows/action` → focus/restore/minimize/maximize/close a matched window
 - `POST /launch` → start an app/process without dropping to a shell
 - `POST /wait?screen=0` → wait for text, window, or visual state changes
+- `POST /vision/diff?screen=0` → compare screenshots or regions for meaningful visual change
+- `POST /vision/stability?screen=0` → measure short-interval visual stability
 - `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided 
image bytes - `POST /ocr/find?screen=0` → search OCR output for matching text candidates - `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)