diff --git a/TODO.md b/TODO.md index aadb713..5a9f6c9 100644 --- a/TODO.md +++ b/TODO.md @@ -28,8 +28,8 @@ - [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs ## Deferred Backlog (Prioritized) -1. [ ] Higher-level automation macros composed from `see` + `interact` + `interact/verify` +1. [ ] Higher-level automation macros composed from `see` + `interact` 2. [ ] Reusable workflow templates (for example: find text -> zoom fallback -> click -> verify) 3. [ ] Batch-safe orchestration primitives with explicit per-step results and auditability -4. [ ] Additional verify primitives beyond `ocr_text_near_point` (image diff region, window title/process state, color/pixel checks) +4. [ ] Additional verify primitives for post-action validation (image diff region, window title/process state, color/pixel checks) 5. [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options diff --git a/docs/API.md b/docs/API.md index 96fd6ca..0cab5cd 100644 --- a/docs/API.md +++ b/docs/API.md @@ -11,7 +11,6 @@ x-clickthrough-token: This API is intended for AI computer control through these methods: - `see` - `interact` -- `interact/verify` - `exec` All responses use one envelope. @@ -178,44 +177,7 @@ Target modes: } ``` -## 3) Interact Verify - -### `POST /interact/verify` -Execute one interact action, then poll quick OCR verification checks until success or timeout. - -```json -{ - "action": { - "screen": 0, - "action": { - "action": "click_text", - "click_text": { - "text": "Apply", - "match": "contains" - } - } - }, - "verify": { - "type": "ocr_text_near_point", - "text": "Applied", - "x": 1180, - "y": 640, - "radius": 120, - "screen": 0, - "match": "contains" - }, - "check_interval_ms": 250, - "timeout_ms": 3000 -} -``` - -Response includes: -- `action_result` -- `verified` -- `attempts` -- `last_check` -- `duration_ms` -## 4) Exec +## 3) Exec ### `POST /exec` Run host shell commands (PowerShell/Bash/CMD). diff --git a/examples/quickstart.py b/examples/quickstart.py index eeb573c..93dbd3d 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -58,26 +58,7 @@ def main(): ) click_text.raise_for_status() click_data = click_text.json()["data"] - target = click_data["resolved_target"] - verify = requests.post( - f"{BASE_URL}/interact/verify", - headers=headers, - json={ - "action": {"screen": SCREEN, "action": {"action": "click", "target": {"mode": "pixel", "x": target["x"], "y": target["y"]}}}, - "verify": { - "type": "ocr_text_near_point", - "text": label, - "x": target["x"], - "y": target["y"], - "radius": 150, - "screen": SCREEN, - }, - "timeout_ms": 1500, - }, - timeout=30, - ) - verify.raise_for_status() - print("verify:", verify.json()["data"]["verified"]) + print("clicked:", click_data["resolved_target"]) if __name__ == "__main__": diff --git a/server/app.py b/server/app.py index 3fe7521..434e8f8 100644 --- a/server/app.py +++ b/server/app.py @@ -8,14 +8,13 @@ from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse from .config import SETTINGS -from .models import ExecRequest, InteractRequest, InteractVerifyRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery +from .models import ExecRequest, InteractRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery from .services import ( apply_window_action, capture_region_image, capture_screen, draw_grid, encode_image, - execute_and_verify, extract_ocr_items, exec_action, exec_command as run_exec_command, @@ -159,11 +158,6 @@ def interact(req: InteractRequest, _: None = Depends(_auth)): return _ok(exec_action(req.action, req.screen)) -@app.post("/interact/verify") -def interact_verify(req: InteractVerifyRequest, _: None = Depends(_auth)): - return _ok(execute_and_verify(req)) - - @app.get("/health") def health(_: None = Depends(_auth)): return _ok( diff --git a/server/models.py b/server/models.py index cf8b4d8..0d68f72 100644 --- a/server/models.py +++ b/server/models.py @@ -164,25 +164,4 @@ class ClickTextAction(BaseModel): return self -class VerifyOCRTextNearPoint(BaseModel): - type: Literal["ocr_text_near_point"] - text: str = Field(min_length=1, max_length=1000) - x: int = Field(ge=0) - y: int = Field(ge=0) - radius: int = Field(default=80, ge=1, le=1000) - screen: int = 0 - match: Literal["contains", "exact", "regex"] = "contains" - case_sensitive: bool = False - min_confidence: float = Field(default=0.0, ge=0.0, le=100.0) - ocr_lang: str = Field(default="eng", min_length=1, max_length=64) - ocr_psm: int | None = Field(default=None, ge=0, le=13) - - -class InteractVerifyRequest(BaseModel): - action: InteractRequest - verify: VerifyOCRTextNearPoint - check_interval_ms: int = Field(default=250, ge=50, le=5000) - timeout_ms: int = Field(default=3000, ge=100, le=60000) - - ActionRequest.model_rebuild() diff --git a/server/services.py b/server/services.py index 48524bd..9fdbd73 100644 --- a/server/services.py +++ b/server/services.py @@ -15,11 +15,9 @@ from .models import ( ActionRequest, ClickTextAction, GridTarget, - InteractVerifyRequest, LaunchRequest, PixelTarget, Target, - VerifyOCRTextNearPoint, WindowActionRequest, WindowQuery, ) @@ -357,52 +355,6 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict: } -def _verify_ocr_text_near_point(spec: VerifyOCRTextNearPoint) -> dict: - radius = spec.radius - display, _, _ = select_display(spec.screen) - region_x = max(display["x"], spec.x - radius) - region_y = max(display["y"], spec.y - radius) - max_right = display["x"] + display["width"] - max_bottom = display["y"] + display["height"] - region_right = min(max_right, spec.x + radius) - region_bottom = min(max_bottom, spec.y + radius) - region_w = max(1, region_right - region_x) - region_h = max(1, region_bottom - region_y) - img, region, _, _, screen_selection = capture_region_image(spec.screen, region_x, region_y, region_w, region_h) - items = extract_ocr_items(img, region["x"], region["y"], spec.min_confidence, spec.ocr_lang, spec.ocr_psm) - matches = [item for item in items if _text_matches(item["text"], spec.text, spec.match, spec.case_sensitive)] - return {"ok": len(matches) > 0, "matches": matches[:8], "items_count": len(items), "screen": screen_selection, "region": region} - - -def execute_and_verify(req: InteractVerifyRequest) -> dict: - started = time.time() - action_result = exec_action(req.action.action, req.action.screen) - attempts = 0 - last_check = None - deadline = started + (req.timeout_ms / 1000.0) - while True: - attempts += 1 - check = _verify_ocr_text_near_point(req.verify) - last_check = check - if check["ok"]: - return { - "action_result": action_result, - "verified": True, - "attempts": attempts, - "last_check": last_check, - "duration_ms": int((time.time() - started) * 1000), - } - if time.time() >= deadline: - return { - "action_result": action_result, - "verified": False, - "attempts": attempts, - "last_check": last_check, - "duration_ms": int((time.time() - started) * 1000), - } - time.sleep(req.check_interval_ms / 1000.0) - - def windows_only(feature: str): if sys.platform != "win32": raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only") diff --git a/skill/SKILL.md b/skill/SKILL.md index 38c14cb..418e20a 100644 --- a/skill/SKILL.md +++ b/skill/SKILL.md @@ -8,7 +8,6 @@ description: Use 3 methods to control a computer: see (screenshot+grid), interac Use these methods: - `see` - `interact` -- `interact/verify` - `exec` ## Method 1: See @@ -41,15 +40,6 @@ Rules: - Use `pixel` only when you already have reliable coordinates. - After each important action, call `see` again before continuing. -## Method 2.5: Action Verify - -Use `POST /interact/verify` to execute one action and immediately validate nearby OCR text state. - -Rules: -- Keep verification narrow: use `ocr_text_near_point` with a focused radius. -- Use short intervals/timeouts for speed (`check_interval_ms` ~250, `timeout_ms` 1000-3000). -- Prefer this over manual re-check loops when immediate confirmation is required. - ## Method 3: Exec Use `POST /exec` only for shell/system tasks. @@ -64,7 +54,7 @@ Rules: 1. `see` capture. 2. If needed, `see/zoom` refine. 3. `interact` one step (`click_text` for text UI targets). -4. `interact/verify` for action->state confirmation, or `see` verify. +4. `see` verify. 5. Repeat. ## Quick Safety Rules diff --git a/tests/test_ocr_and_interact.py b/tests/test_ocr_and_interact.py index 7116008..9efcf4b 100644 --- a/tests/test_ocr_and_interact.py +++ b/tests/test_ocr_and_interact.py @@ -112,37 +112,3 @@ def test_see_ocr_off_on_contract(monkeypatch): on = client.post("/see", json={"ocr": True, "with_grid": False}, headers=_auth_headers()) assert on.status_code == 200 assert on.json()["data"]["meta"]["ocr"][0]["text"] == "x" - - -def test_interact_verify_success_and_timeout(monkeypatch): - calls = {"n": 0} - monkeypatch.setattr(services, "exec_action", lambda action, screen=0: {"action": action.action, "executed": True}) - - def fake_verify(_spec): - calls["n"] += 1 - return {"ok": calls["n"] >= 2, "matches": [], "items_count": 1, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}} - - monkeypatch.setattr(services, "_verify_ocr_text_near_point", fake_verify) - client = TestClient(app) - payload = { - "action": {"screen": 0, "action": {"action": "type", "text": "hello"}}, - "verify": {"type": "ocr_text_near_point", "text": "hello", "x": 100, "y": 100}, - "check_interval_ms": 50, - "timeout_ms": 500, - } - ok_resp = client.post("/interact/verify", json=payload, headers=_auth_headers()) - assert ok_resp.status_code == 200 - ok_data = ok_resp.json()["data"] - assert ok_data["verified"] is True - assert ok_data["attempts"] == 2 - - monkeypatch.setattr( - services, - "_verify_ocr_text_near_point", - lambda _spec: {"ok": False, "matches": [], "items_count": 0, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}}, - ) - timeout_resp = client.post("/interact/verify", json=payload, headers=_auth_headers()) - assert timeout_resp.status_code == 200 - timeout_data = timeout_resp.json()["data"] - assert timeout_data["verified"] is False - assert timeout_data["attempts"] >= 1