feat(verify): add compound action+verify flows

2026-05-01 16:26:57 +02:00
parent 02bf069425
commit c66779d929
4 changed files with 111 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
 - **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait`
 - **Vision helper endpoints**: compare screenshots and measure stability via `POST /vision/diff` and `POST /vision/stability`
 - **OCR endpoints**: extract text blocks or search for matching text via `POST /ocr` and `POST /ocr/find`
+- **Compound verify endpoint**: execute an action and wait for a structured success condition via `POST /action/verify`
 - **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
 - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
 - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
@@ -39,8 +40,8 @@ For OCR support, install the native `tesseract` binary on the host (in addition
 2. `GET /screen?screen=0` with grid
 3. Decide cell / target
 4. Optional `POST /zoom?screen=0` for finer targeting
-5. `POST /action?screen=0` to execute
-6. `GET /screen?screen=0` again to verify result, or use `POST /ocr/find` when you need explicit text matching
+5. `POST /action?screen=0` to execute (or `POST /action/verify?screen=0` for a bundled action+wait flow)
+6. `GET /screen?screen=0` again to verify result, or use `POST /wait`, `POST /vision/diff`, or `POST /ocr/find`

 Important:
 - `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields.
--- a/docs/API.md
+++ b/docs/API.md
@@ -420,6 +420,43 @@ Notes:
 - Window waits build on the structured window discovery endpoint.
 - Visual waits compare repeated captures of either the full selected display or an explicit region.

+## `POST /action/verify`
+
+Execute one action and wait for a structured success condition.
+
+Query params:
+
+- `screen` (int, default `0`)
+
+```json
+{
+  "action": {
+    "action": "click",
+    "target": {"mode": "pixel", "x": 1300, "y": 740}
+  },
+  "condition": {
+    "kind": "text",
+    "mode": "screen",
+    "text": "Settings",
+    "match": "contains",
+    "present": true,
+    "language_hint": "eng",
+    "min_confidence": 0.4
+  },
+  "retries": 1,
+  "timeout_ms": 4000,
+  "poll_interval_ms": 250,
+  "retry_delay_ms": 250
+}
+```
+
+Condition kinds mirror `POST /wait`:
+- `text`
+- `window`
+- `visual`
+
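+The response's `result` object contains `success`, a zero-based `final_attempt`, and an `attempts` list holding each attempt's action output and verification output. `retries` counts extra attempts after the first; `stop_on_action_error` (default `true`) ends the run as soon as the action itself raises.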
+
 ## `POST /ocr`

 Extract visible text from either a full screenshot, a region crop, or caller-provided image bytes.
--- a/server/app.py
+++ b/server/app.py
@@ -278,6 +278,16 @@ class VisionStabilityRequest(BaseModel):
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)


+class VerifyActionRequest(BaseModel):
+    action: ActionRequest  # passed to _exec_action once per attempt
+    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition  # forwarded to WaitRequest
+    retries: int = Field(default=0, ge=0, le=10)  # extra attempts; the loop runs retries + 1 times
+    timeout_ms: int = Field(default=5000, ge=0, le=120000)  # wait timeout applied to each attempt
+    poll_interval_ms: int = Field(default=250, ge=50, le=10000)  # forwarded to WaitRequest unchanged
+    retry_delay_ms: int = Field(default=200, ge=0, le=60000)  # sleep between unsatisfied attempts
+    stop_on_action_error: bool = True  # if set, an action exception ends the run without waiting
+
+

 def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
    token = SETTINGS["token"]
@@ -1015,6 +1025,53 @@ def _measure_stability(req: VisionStabilityRequest, screen: int = 0) -> dict:
    }


+def _run_verified_action(req: VerifyActionRequest, screen: int = 0) -> dict:
+    """Execute ``req.action`` and wait for ``req.condition``, retrying on failure.
+
+    The action runs up to ``req.retries + 1`` times. After each run the
+    condition is polled through ``_wait_for_condition``; the first satisfied
+    wait ends the loop. An action exception stops everything immediately when
+    ``req.stop_on_action_error`` is set, otherwise the wait still runs.
+
+    Returns ``success``, ``attempts`` and a zero-based ``final_attempt``. Every
+    attempt record carries the same keys (``attempt``, ``action_ok``,
+    ``action_error``, ``action_result``, ``verification``); unset ones are None.
+    """
+    attempts = []
+    for attempt in range(req.retries + 1):
+        # Start from a complete record so early exits expose the same keys.
+        record = {
+            "attempt": attempt,
+            "action_ok": True,
+            "action_error": None,
+            "action_result": None,
+            "verification": None,
+        }
+        attempts.append(record)
+        try:
+            record["action_result"] = _exec_action(req.action, screen)
+        except Exception as exc:
+            record["action_ok"] = False
+            record["action_error"] = str(exc)
+            if req.stop_on_action_error:
+                return {"success": False, "attempts": attempts, "final_attempt": attempt}
+
+        record["verification"] = _wait_for_condition(
+            WaitRequest(
+                condition=req.condition,
+                timeout_ms=req.timeout_ms,
+                poll_interval_ms=req.poll_interval_ms,
+            ),
+            screen,
+        )
+        if record["verification"].get("satisfied"):
+            return {"success": True, "attempts": attempts, "final_attempt": attempt}
+        if attempt < req.retries and req.retry_delay_ms > 0:
+            time.sleep(req.retry_delay_ms / 1000.0)
+
+    return {"success": False, "attempts": attempts, "final_attempt": req.retries}
+
+
 def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
    condition = req.condition
    deadline = time.time() + (req.timeout_ms / 1000.0)
@@ -1572,6 +1629,17 @@ def vision_stability(req: VisionStabilityRequest, screen: int = 0, _: None = Dep
    }


+@app.post("/action/verify")
+def action_verify(req: VerifyActionRequest, screen: int = 0, _: None = Depends(_auth)):
+    # Run the action+verify loop, then wrap its outcome in the response envelope.
+    outcome = _run_verified_action(req, screen)
+    # Built incrementally; key order stays ok, request_id, time_ms, result.
+    envelope = {"ok": outcome.get("success", False), "request_id": _request_id()}
+    envelope["time_ms"] = _now_ms()
+    envelope["result"] = outcome
+    return envelope
+
+
 @app.post("/ocr")
 def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
    image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
--- a/skill/SKILL.md
+++ b/skill/SKILL.md
@@ -59,6 +59,7 @@ Say what you actually have: screenshots, OCR output, and fresh verification capt
 - `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided image bytes
 - `POST /ocr/find?screen=0` → search OCR output for matching text candidates
 - `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)
+- `POST /action/verify?screen=0` → execute one action plus structured success verification
 - `POST /batch?screen=0` → sequential action list
 - `POST /exec` → PowerShell/Bash/CMD command execution (requires configured exec secret + header)

@@ -93,7 +94,7 @@ Good pattern:
 3. ask a precise question about the visible UI
 4. convert the answer into a concrete Clickthrough target
 5. act once
-6. recapture and verify again
+6. recapture and verify again, or use `POST /action/verify` when the action+postcondition loop is simple enough to bundle cleanly

 Ask narrow questions.
 Good:
@@ -187,7 +188,7 @@ Avoid using `/exec` for routine in-app clicks, menu navigation, or text entry wh
 4. **Before any click**, verify target identity (OCR text/icon/location consistency).
 5. If OCR is insufficient, inspect the screenshot explicitly with the OpenClaw `image` tool instead of pretending you can already see enough.
 6. Execute one minimal action via `POST /action`.
-7. Re-capture with `GET /screen` or use `POST /wait` to verify the expected state change.
+7. Re-capture with `GET /screen` or use `POST /wait`, `POST /vision/diff`, or `POST /vision/stability` to verify the expected state change. `POST /action/verify` bundles steps 6–7 into one call; do not call it after `POST /action`, or the action runs twice.
 8. Repeat until objective is complete.

 ## Verify-before-click rules