diff --git a/README.md b/README.md index f858df2..f0be27e 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots - **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait` - **Vision helper endpoints**: compare screenshots and measure stability via `POST /vision/diff` and `POST /vision/stability` - **OCR endpoints**: extract text blocks or search for matching text via `POST /ocr` and `POST /ocr/find` +- **Compound verify endpoint**: execute an action and wait for a structured success condition via `POST /action/verify` - **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec` - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction @@ -39,8 +40,8 @@ For OCR support, install the native `tesseract` binary on the host (in addition 2. `GET /screen?screen=0` with grid 3. Decide cell / target 4. Optional `POST /zoom?screen=0` for finer targeting -5. `POST /action?screen=0` to execute -6. `GET /screen?screen=0` again to verify result, or use `POST /ocr/find` when you need explicit text matching +5. `POST /action?screen=0` to execute (or `POST /action/verify?screen=0` for a bundled action+wait flow) +6. `GET /screen?screen=0` again to verify result, or use `POST /wait`, `POST /vision/diff`, or `POST /ocr/find` Important: - `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields. diff --git a/docs/API.md b/docs/API.md index 27c2b66..76cfa59 100644 --- a/docs/API.md +++ b/docs/API.md @@ -420,6 +420,43 @@ Notes: - Window waits build on the structured window discovery endpoint. - Visual waits compare repeated captures of either the full selected display or an explicit region. +## `POST /action/verify` + +Execute one action and wait for a structured success condition. 
class VerifyActionRequest(BaseModel):
    """Request body for ``POST /action/verify``: one action plus a success condition."""

    # The action to execute on every attempt.
    action: ActionRequest
    # Post-condition polled after the action.
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    # Number of additional attempts after the first one (0 means a single attempt).
    retries: int = Field(default=0, ge=0, le=10)
    # Per-attempt verification budget, forwarded to the wait request.
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    # Polling interval forwarded to the wait request.
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)
    # Pause between a failed verification and the next attempt.
    retry_delay_ms: int = Field(default=200, ge=0, le=60000)
    # When True, an exception raised by the action aborts the whole run immediately.
    stop_on_action_error: bool = True


def _run_verified_action(req: VerifyActionRequest, screen: int = 0) -> dict:
    """Execute ``req.action`` and wait for ``req.condition``, retrying if needed.

    Up to ``req.retries + 1`` attempts are made. Each attempt runs the action,
    then waits for the condition with the configured timeout and poll interval.

    Args:
        req: The action, the condition and the retry/timeout settings.
        screen: Screen index passed through to the action and wait helpers.

    Returns:
        A dict with ``success`` (bool), ``attempts`` (one record per attempt)
        and ``final_attempt`` (zero-based index of the last attempt made).
        Every attempt record carries the same keys: ``attempt``, ``action_ok``,
        ``action_error``, ``action_result`` and ``verification``.
    """
    attempts = []
    for attempt in range(req.retries + 1):
        action_result = None
        action_error = None
        try:
            action_result = _exec_action(req.action, screen)
        except Exception as exc:
            # Deliberately broad: any action failure is reported in the
            # attempt record instead of failing the whole request.
            action_error = str(exc)
        action_ok = action_error is None

        # Build the record up front so every exit path reports the same keys;
        # the early return used to omit "action_result".
        record = {
            "attempt": attempt,
            "action_ok": action_ok,
            "action_error": action_error,
            "action_result": action_result,
            "verification": None,
        }
        attempts.append(record)

        if not action_ok and req.stop_on_action_error:
            return {"success": False, "attempts": attempts, "final_attempt": attempt}

        # With stop_on_action_error disabled, verification still runs after a
        # failed action, so the condition alone decides success.
        verification = _wait_for_condition(
            WaitRequest(
                condition=req.condition,
                timeout_ms=req.timeout_ms,
                poll_interval_ms=req.poll_interval_ms,
            ),
            screen,
        )
        record["verification"] = verification

        if verification.get("satisfied"):
            return {"success": True, "attempts": attempts, "final_attempt": attempt}
        # Only pause when another attempt will actually follow.
        if attempt < req.retries and req.retry_delay_ms > 0:
            time.sleep(req.retry_delay_ms / 1000.0)

    return {"success": False, "attempts": attempts, "final_attempt": req.retries}


@app.post("/action/verify")
def action_verify(req: VerifyActionRequest, screen: int = 0, _: None = Depends(_auth)):
    """Run one action and wait for its success condition.

    The top-level ``ok`` flag mirrors the ``success`` field of the result.
    """
    result = _run_verified_action(req, screen)
    return {
        "ok": result.get("success", False),
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "result": result,
    }
+- `POST /action/verify?screen=0` → execute one action plus structured success verification - `POST /batch?screen=0` → sequential action list - `POST /exec` → PowerShell/Bash/CMD command execution (requires configured exec secret + header) @@ -93,7 +94,7 @@ Good pattern: 3. ask a precise question about the visible UI 4. convert the answer into a concrete Clickthrough target 5. act once -6. recapture and verify again +6. recapture and verify again, or use `POST /action/verify` when the action+postcondition loop is simple enough to bundle cleanly Ask narrow questions. Good: @@ -187,7 +188,7 @@ Avoid using `/exec` for routine in-app clicks, menu navigation, or text entry wh 4. **Before any click**, verify target identity (OCR text/icon/location consistency). 5. If OCR is insufficient, inspect the screenshot explicitly with the OpenClaw `image` tool instead of pretending you can already see enough. 6. Execute one minimal action via `POST /action`. -7. Re-capture with `GET /screen` or use `POST /wait` to verify the expected state change. +7. Re-capture with `GET /screen` or use `POST /wait`, `POST /vision/diff`, `POST /vision/stability`, or `POST /action/verify` to verify the expected state change. 8. Repeat until objective is complete. ## Verify-before-click rules