feat(wait): add structured wait endpoint

2026-05-01 15:55:29 +02:00
parent 493e5499e8
commit 5122d416e8
4 changed files with 304 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
 - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
 - **Window lifecycle endpoints**: list/focus/restore/minimize/maximize/close windows via `GET /windows` + `POST /windows/action`
 - **Structured launch endpoint**: start an app/process without dropping to a shell via `POST /launch`
 - **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait`
 - **OCR endpoint**: extract text blocks with bounding boxes via `POST /ocr`
 - **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
 - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
@@ -43,7 +44,7 @@ For OCR support, install the native `tesseract` binary on the host (in addition
 Important:
 - `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields.
 - Pixel coordinates and OCR bounding boxes are always global desktop coordinates.
- Prefer structured GUI interaction first; use `/windows`, `/launch`, and `/action` before reaching for `/exec`.
+- Prefer structured GUI interaction first; use `/windows`, `/launch`, `/wait`, and `/action` before reaching for `/exec`.
 See:
 - `docs/API.md`
--- a/docs/API.md
+++ b/docs/API.md
@@ -277,6 +277,80 @@ Notes:
 - If `wait_for_window=true`, the server polls for a matching window and returns `window_found`.
 - `dry_run=true` returns the resolved argv/cwd without launching.
 ## `POST /wait`
 Wait on a structured UI condition instead of guessing sleep durations.
 Query params:
 - `screen` (int, default `0`) - used for text and visual waits
 ### Wait for text to appear
 ```json
 {
  "condition": {
    "kind": "text",
    "mode": "screen",
    "text": "Scan complete",
    "match": "contains",
    "present": true,
    "language_hint": "eng",
    "min_confidence": 0.4
  },
  "timeout_ms": 15000,
  "poll_interval_ms": 400
 }
 ```
 ### Wait for a window state
 ```json
 {
  "condition": {
    "kind": "window",
    "title_contains": "WinDirStat",
    "visible_only": true,
    "state": "focused"
  },
  "timeout_ms": 5000,
  "poll_interval_ms": 200
 }
 ```
 Window states:
 - `exists`
 - `focused`
 - `closed`
 ### Wait for visual change or stability
 ```json
 {
  "condition": {
    "kind": "visual",
    "state": "stable",
    "region_x": 0,
    "region_y": 0,
    "region_width": 1920,
    "region_height": 1080,
    "diff_threshold": 0.005,
    "stable_for_ms": 1000
  },
  "timeout_ms": 12000,
  "poll_interval_ms": 300
 }
 ```
 Visual states:
 - `change` — succeeds when the average pixel diff between two consecutive captures reaches or exceeds `diff_threshold`
 - `stable` — succeeds when the diff between consecutive captures stays at or below `diff_threshold` for at least `stable_for_ms`
 Notes:
 - Text waits reuse the OCR pipeline and return matching OCR blocks on success.
 - Window waits build on the structured window discovery endpoint.
 - Visual waits compare repeated captures of either the full selected display or an explicit region.
 ## `POST /ocr`
 Extract visible text from either a full screenshot, a region crop, or caller-provided image bytes.
--- a/server/app.py
+++ b/server/app.py
@@ -3,6 +3,7 @@ import ctypes
 import hmac
 import io
 import os
 import re
 import subprocess
 import sys
 import time
@@ -11,6 +12,7 @@ from typing import Literal, Optional
 from dotenv import load_dotenv
 from fastapi import Depends, FastAPI, Header, HTTPException, Response
 from PIL import ImageChops, ImageStat
 from pydantic import BaseModel, Field, model_validator
@@ -193,6 +195,50 @@ class LaunchRequest(BaseModel):
    dry_run: bool = False
 # Wait condition satisfied by the presence (or absence) of OCR text.
 class WaitTextCondition(BaseModel):
    kind: Literal["text"]
    # "region" makes the four region_* fields mandatory (enforced below).
    mode: Literal["screen", "region"] = "screen"
    text: str = Field(min_length=1, max_length=512)
    match: Literal["contains", "exact", "regex"] = "contains"
    # True waits for the text to show up; False waits for it to be gone.
    present: bool = True
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    @model_validator(mode="after")
    def _validate_region(self):
        # Only mode=region carries the "all four coordinates" requirement.
        if self.mode != "region":
            return self
        missing = [
            name
            for name in ("region_x", "region_y", "region_width", "region_height")
            if getattr(self, name) is None
        ]
        if missing:
            raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        return self
 # Wait condition on desktop windows; extends WindowQuery, so the whole object
 # is handed to _list_windows() as the window filter on every poll.
 class WaitWindowCondition(WindowQuery):
    kind: Literal["window"]
    # How _wait_for_condition interprets the matches returned by _list_windows():
    #   "exists"  - at least one window matched
    #   "focused" - at least one matched window has a truthy "foreground" entry
    #   "closed"  - no window matched
    state: Literal["exists", "focused", "closed"] = "exists"
 # Wait condition based on the pixel difference between successive captures.
 class WaitVisualCondition(BaseModel):
    kind: Literal["visual"]
    # "change": satisfied once a poll's diff ratio is >= diff_threshold.
    # "stable": satisfied once the diff ratio has stayed <= diff_threshold
    #           for stable_for_ms.
    state: Literal["change", "stable"] = "change"
    # Optional capture region. If any of the four values is None,
    # _capture_region_image returns the whole captured monitor instead.
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Compared against _image_diff_ratio(), i.e. the mean per-channel
    # difference divided by 255 (0.0 = identical captures).
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)
    # Only consulted when state == "stable".
    stable_for_ms: int = Field(default=800, ge=0, le=60000)
 # Request body accepted by POST /wait.
 class WaitRequest(BaseModel):
    # Exactly one of the three condition models; each declares a distinct
    # `kind` literal ("text", "window", "visual").
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    # Overall time budget in milliseconds (0..120000).
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    # Sleep between two polls in milliseconds (50..10000).
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)
 def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
@@ -483,6 +529,18 @@ def _run_ocr(image, language_hint: str | None, min_confidence: float, offset_x:
    return blocks
 def _normalize_text(value: str) -> str:
    return re.sub(r"\s+", " ", value).strip()
 def _matches_text(haystack: str, needle: str, match_mode: str) -> bool:
    if match_mode == "exact":
        return haystack == needle
    if match_mode == "regex":
        return re.search(needle, haystack) is not None
    return needle.lower() in haystack.lower()
 def _windows_only(feature: str):
    if sys.platform != "win32":
        raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")
@@ -698,6 +756,163 @@ def _launch_app(req: LaunchRequest) -> dict:
    return result
 def _capture_region_image(screen: int, region_x: int | None, region_y: int | None, region_width: int | None, region_height: int | None):
    """Capture ``screen`` and optionally crop it to a region.

    The region is given in the same coordinate space as ``mon["x"]`` /
    ``mon["y"]``. If any of the four region values is ``None`` the full
    capture is returned together with the monitor rectangle.

    Returns:
        ``(image, region, mon, displays, screen_selection)``.

    Raises:
        HTTPException: 400 when the region does not fit in the capture.
    """
    base_img, mon, displays, screen_selection = _capture_screen(screen)
    requested = (region_x, region_y, region_width, region_height)
    if any(value is None for value in requested):
        whole = {key: mon[key] for key in ("x", "y", "width", "height")}
        return base_img, whole, mon, displays, screen_selection

    # Translate the region into coordinates relative to the captured image.
    left, top = region_x - mon["x"], region_y - mon["y"]
    right, bottom = left + region_width, top + region_height
    img_width, img_height = base_img.size[0], base_img.size[1]
    fits = left >= 0 and top >= 0 and right <= img_width and bottom <= img_height
    if not fits:
        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

    cropped = base_img.crop((left, top, right, bottom))
    region = {"x": region_x, "y": region_y, "width": region_width, "height": region_height}
    return cropped, region, mon, displays, screen_selection
 def _image_diff_ratio(before, after) -> float:
    """Return the mean per-channel difference of two images, scaled by 255.

    The value is the average of the ``ImageStat`` channel means of
    ``ImageChops.difference(before, after)`` divided by 255, so identical
    images give ``0.0``.
    """
    means = ImageStat.Stat(ImageChops.difference(before, after)).mean
    if not isinstance(means, list):
        means = [means]
    total = sum(means)
    return float(total / (len(means) * 255.0))
 def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
    """Poll until the requested wait condition holds or the timeout elapses.

    Args:
        req: Validated wait request; ``req.condition`` selects the visual,
            window, or text strategy.
        screen: Display index forwarded to ``_capture_region_image`` for
            visual and text waits; window waits do not use it.

    Returns:
        A dict that always carries ``satisfied``, ``kind`` and ``polls``.
        Visual results add ``state``, ``region``, ``diff_ratio``, ``screen``
        and ``display`` (plus ``stable_for_ms`` on a successful stable wait).
        Successful window results add ``state`` and up to ten ``matches``;
        successful text results add ``mode``, ``region``, ``matches``,
        ``screen`` and ``display``.

    Raises:
        HTTPException: 400 when a regex text pattern does not compile or
            the condition type is unsupported. Errors raised by the
            capture, OCR, and window helpers propagate unchanged.
    """
    condition = req.condition
    interval_s = req.poll_interval_ms / 1000.0
    # Use the monotonic clock: wall-clock adjustments (NTP, manual changes)
    # must not shorten or stretch the caller's timeout.
    deadline = time.monotonic() + (req.timeout_ms / 1000.0)
    polls = 0

    if isinstance(condition, WaitVisualCondition):
        baseline, region, mon, _displays, screen_selection = _capture_region_image(
            screen,
            condition.region_x,
            condition.region_y,
            condition.region_width,
            condition.region_height,
        )
        stable_since = None
        last_diff = 0.0
        while True:
            if time.monotonic() > deadline:
                return {
                    "satisfied": False,
                    "kind": condition.kind,
                    "state": condition.state,
                    "polls": polls,
                    "region": region,
                    "diff_ratio": last_diff,
                    "screen": screen_selection,
                    "display": mon,
                }
            time.sleep(interval_s)
            # Re-capture the exact region resolved by the first capture.
            current, _, _, _, _ = _capture_region_image(
                screen,
                region["x"],
                region["y"],
                region["width"],
                region["height"],
            )
            polls += 1
            last_diff = _image_diff_ratio(baseline, current)
            if condition.state == "change":
                if last_diff >= condition.diff_threshold:
                    return {
                        "satisfied": True,
                        "kind": condition.kind,
                        "state": condition.state,
                        "polls": polls,
                        "region": region,
                        "diff_ratio": last_diff,
                        "screen": screen_selection,
                        "display": mon,
                    }
            elif last_diff <= condition.diff_threshold:
                now = time.monotonic()
                if stable_since is None:
                    stable_since = now
                stable_ms = (now - stable_since) * 1000
                if stable_ms >= condition.stable_for_ms:
                    return {
                        "satisfied": True,
                        "kind": condition.kind,
                        "state": condition.state,
                        "polls": polls,
                        "region": region,
                        "diff_ratio": last_diff,
                        "stable_for_ms": int(stable_ms),
                        "screen": screen_selection,
                        "display": mon,
                    }
            else:
                # Any poll above the threshold restarts the stability window.
                stable_since = None
            # Diffs are always measured against the previous capture.
            baseline = current

    target = None
    if isinstance(condition, WaitTextCondition):
        # The needle never changes between polls, so normalize it once.
        target = _normalize_text(condition.text)
        if condition.match == "regex":
            # Reject a broken pattern up front with a client error instead of
            # letting re.error escape mid-poll as an internal server error.
            try:
                re.compile(target)
            except re.error as exc:
                raise HTTPException(status_code=400, detail=f"invalid regex pattern: {exc}") from exc

    while True:
        if isinstance(condition, WaitWindowCondition):
            matches = _list_windows(condition)
            polls += 1
            satisfied = False
            if condition.state == "exists":
                satisfied = bool(matches)
            elif condition.state == "focused":
                satisfied = any(item["foreground"] for item in matches)
            elif condition.state == "closed":
                satisfied = not matches
            if satisfied:
                return {
                    "satisfied": True,
                    "kind": condition.kind,
                    "state": condition.state,
                    "polls": polls,
                    "matches": matches[:10],
                }
        elif isinstance(condition, WaitTextCondition):
            image, region, mon, _displays, screen_selection = _capture_region_image(
                screen,
                condition.region_x,
                condition.region_y,
                condition.region_width,
                condition.region_height,
            )
            blocks = _run_ocr(
                image,
                condition.language_hint,
                condition.min_confidence,
                region["x"],
                region["y"],
            )
            polls += 1
            matched = [
                block
                for block in blocks
                if _matches_text(_normalize_text(block["text"]), target, condition.match)
            ]
            # present=True waits for a match, present=False for its absence.
            if bool(matched) == condition.present:
                return {
                    "satisfied": True,
                    "kind": condition.kind,
                    "mode": condition.mode,
                    "polls": polls,
                    "region": region,
                    "matches": matched,
                    "screen": screen_selection,
                    "display": mon,
                }
        else:
            raise HTTPException(status_code=400, detail="unsupported wait condition")
        # Check the deadline after the poll so even timeout_ms=0 polls once.
        if time.monotonic() > deadline:
            return {
                "satisfied": False,
                "kind": condition.kind,
                "polls": polls,
            }
        time.sleep(interval_s)
 def _pick_shell(explicit_shell: str | None) -> str:
    shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
    if shell_name not in {"powershell", "bash", "cmd"}:
@@ -1089,6 +1304,17 @@ def launch(req: LaunchRequest, _: None = Depends(_auth)):
    }
@app.post("/wait")
def wait(req: WaitRequest, screen: int = 0, _: None = Depends(_auth)):
    """Run the wait described by ``req`` and wrap its outcome in the response envelope.

    ``ok`` mirrors the ``satisfied`` flag of the wait result (``False`` if absent).
    """
    outcome = _wait_for_condition(req, screen)
    envelope = {
        "ok": outcome.get("satisfied", False),
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "result": outcome,
    }
    return envelope
@app.post("/ocr")
 def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
    source = req.mode
--- a/skill/SKILL.md
+++ b/skill/SKILL.md
@@ -39,6 +39,7 @@ The agent should not assume it can self-install this stack.
 - `GET /windows` → discover visible desktop windows and their handles/processes
 - `POST /windows/action` → focus/restore/minimize/maximize/close a matched window
 - `POST /launch` → start an app/process without dropping to a shell
 - `POST /wait?screen=0` → wait for text, window, or visual state changes
 - `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided image bytes
 - `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)
 - `POST /batch?screen=0` → sequential action list
@@ -140,7 +141,7 @@ Avoid using `/exec` for routine in-app clicks, menu navigation, or text entry wh
 3. If confidence < 0.85, call `POST /zoom` with denser grid (e.g., 20x20) and re-evaluate.
 4. **Before any click**, verify target identity (OCR text/icon/location consistency).
 5. Execute one minimal action via `POST /action`.
-6. Re-capture with `GET /screen` and verify the expected state change.
+6. Re-capture with `GET /screen` or use `POST /wait` to verify the expected state change.
 7. Repeat until objective is complete.
 ## Verify-before-click rules