diff --git a/TODO.md b/TODO.md index bc40182..22542db 100644 --- a/TODO.md +++ b/TODO.md @@ -26,3 +26,8 @@ - [x] Upgrade skill with verify-before-click rules, confidence thresholds, two-phase risky actions, and Spotify playbook - [x] Add top-level skill section for instance setup + mini API docs - [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs + +## Deferred Backlog +- [ ] Higher-level task macros composed from `see` + `interact` + `interact/verify` primitives +- [ ] Additional verify primitives beyond `ocr_text_near_point` (image-diff region, window title/process state, color/pixel checks) +- [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options diff --git a/docs/API.md b/docs/API.md index 41b46d3..96fd6ca 100644 --- a/docs/API.md +++ b/docs/API.md @@ -8,9 +8,10 @@ Auth header when enabled: x-clickthrough-token: ``` -This API is intended for AI computer control through 3 methods only: +This API is intended for AI computer control through these methods: - `see` - `interact` +- `interact/verify` - `exec` All responses use one envelope. @@ -62,7 +63,11 @@ Capture a full screen or a region. Optional grid overlay returns coordinate meta "grid_cols": 12, "include_labels": true, "image_format": "png", - "jpeg_quality": 85 + "jpeg_quality": 85, + "ocr": false, + "ocr_min_confidence": 0, + "ocr_lang": "eng", + "ocr_psm": null } ``` @@ -70,6 +75,14 @@ Returns: - `data.image.base64` - `data.meta.region` (global desktop coords) - `data.meta.grid` (rows/cols/cell size + formula) +- `data.meta.ocr` (when `ocr=true`) + +OCR item shape: +- `text` +- `confidence` +- `bbox` (global coords) +- `center` +- `region_relative_bbox` ### `POST /see/zoom` Capture a tighter crop around a global point and draw another grid over that crop. 
@@ -126,12 +139,83 @@ Supported actions: - `scroll` (`scroll_amount`) - `type` (`text`, `interval_ms`) - `hotkey` (`keys`) +- `click_text` (OCR-driven text click with optional region) Target modes: - `pixel`: absolute global `x,y` - `grid`: grid cell from a `see`/`see/zoom` response -## 3) Exec +### `click_text` example (full screen OCR) +```json +{ + "screen": 0, + "action": { + "action": "click_text", + "click_text": { + "text": "Sign in", + "match": "contains", + "case_sensitive": false, + "min_confidence": 45, + "occurrence": "best" + } + } +} +``` + +### `click_text` example (region OCR) +```json +{ + "screen": 0, + "action": { + "action": "click_text", + "click_text": { + "text": "Continue", + "match": "exact", + "region": { "x": 940, "y": 520, "width": 400, "height": 260 }, + "occurrence": "first" + } + } +} +``` + +## 3) Interact Verify + +### `POST /interact/verify` +Execute one interact action, then poll quick OCR verification checks until success or timeout. + +```json +{ + "action": { + "screen": 0, + "action": { + "action": "click_text", + "click_text": { + "text": "Apply", + "match": "contains" + } + } + }, + "verify": { + "type": "ocr_text_near_point", + "text": "Applied", + "x": 1180, + "y": 640, + "radius": 120, + "screen": 0, + "match": "contains" + }, + "check_interval_ms": 250, + "timeout_ms": 3000 +} +``` + +Response includes: +- `action_result` +- `verified` +- `attempts` +- `last_check` +- `duration_ms` +## 4) Exec ### `POST /exec` Run host shell commands (PowerShell/Bash/CMD). 
diff --git a/examples/quickstart.py b/examples/quickstart.py index a465fa2..eeb573c 100644 --- a/examples/quickstart.py +++ b/examples/quickstart.py @@ -35,6 +35,50 @@ def main(): print("region:", payload["meta"]["region"]) print("grid:", payload["meta"].get("grid", {})) + see_ocr = requests.post( + f"{BASE_URL}/see", + headers=headers, + json={"screen": SCREEN, "ocr": True, "with_grid": False, "ocr_min_confidence": 40}, + timeout=30, + ) + see_ocr.raise_for_status() + ocr_items = see_ocr.json()["data"]["meta"].get("ocr", []) + print("ocr_items:", len(ocr_items)) + + if ocr_items: + label = ocr_items[0]["text"] + click_text = requests.post( + f"{BASE_URL}/interact", + headers=headers, + json={ + "screen": SCREEN, + "action": {"action": "click_text", "click_text": {"text": label, "match": "exact", "occurrence": "first"}}, + }, + timeout=30, + ) + click_text.raise_for_status() + click_data = click_text.json()["data"] + target = click_data["resolved_target"] + verify = requests.post( + f"{BASE_URL}/interact/verify", + headers=headers, + json={ + "action": {"screen": SCREEN, "action": {"action": "click", "target": {"mode": "pixel", "x": target["x"], "y": target["y"]}}}, + "verify": { + "type": "ocr_text_near_point", + "text": label, + "x": target["x"], + "y": target["y"], + "radius": 150, + "screen": SCREEN, + }, + "timeout_ms": 1500, + }, + timeout=30, + ) + verify.raise_for_status() + print("verify:", verify.json()["data"]["verified"]) + if __name__ == "__main__": main() diff --git a/server/app.py b/server/app.py index 95d9f08..3fe7521 100644 --- a/server/app.py +++ b/server/app.py @@ -8,13 +8,15 @@ from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse from .config import SETTINGS -from .models import ExecRequest, InteractRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery +from .models import ExecRequest, InteractRequest, InteractVerifyRequest, LaunchRequest, SeeRequest, SeeZoomRequest, 
@app.post("/interact/verify")
def interact_verify(req: InteractVerifyRequest, _: None = Depends(_auth)):
    """Handle ``POST /interact/verify``.

    Delegates to ``execute_and_verify``, which runs the single interact
    action carried in ``req.action`` and then polls the OCR check described
    by ``req.verify`` until it succeeds or ``req.timeout_ms`` elapses. The
    resulting dict is wrapped with ``_ok`` before being returned.

    The ``_`` parameter exists only to run the ``_auth`` dependency.
    """
    return _ok(execute_and_verify(req))
class ClickTextAction(BaseModel):
    """Parameters for the OCR-driven ``click_text`` interact action.

    The server OCRs a capture of the screen (or of ``region``), selects one
    OCR entry whose text matches ``text`` and clicks the center of its
    bounding box.

    Validation rules:
    - ``nth`` must be provided if and only if ``occurrence == "nth"``.
    - when ``match == "regex"``, ``text`` must compile as a regular
      expression, so a malformed pattern is rejected as a validation error
      instead of failing later inside the matcher.
    """

    # Query text; interpreted as a substring, a full string or a regex
    # depending on ``match``.
    text: str = Field(min_length=1, max_length=1000)
    match: Literal["contains", "exact", "regex"] = "contains"
    # Optional capture bounds for OCR; omit for full-screen OCR.
    region: OCRRegion | None = None
    # When set, takes precedence over the request-level ``screen``.
    screen: int | None = None
    case_sensitive: bool = False
    # OCR entries with a confidence below this value are ignored.
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    # "first": first match, "best": highest confidence, "nth": 1-based index.
    occurrence: Literal["first", "best", "nth"] = "first"
    nth: int | None = Field(default=None, ge=1, le=10000)
    # Forwarded to the OCR engine as language and page segmentation mode.
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)

    @model_validator(mode="after")
    def _validate_nth(self):
        if self.occurrence == "nth" and self.nth is None:
            raise ValueError("nth is required when occurrence=nth")
        if self.occurrence != "nth" and self.nth is not None:
            raise ValueError("nth is only allowed when occurrence=nth")
        return self

    @model_validator(mode="after")
    def _validate_regex(self):
        if self.match == "regex":
            # Local import keeps this block self-contained regardless of the
            # module's top-level imports.
            import re

            try:
                re.compile(self.text)
            except re.error as exc:
                raise ValueError(f"text is not a valid regular expression: {exc}") from exc
        return self
def api_error(status_code: int, code: str, message: str, details=None):
    """Abort the current request with a structured error.

    Always raises ``HTTPException`` and never returns. The exception's
    ``detail`` is a dict with the keys ``code``, ``message`` and ``details``.
    """
    payload = {"code": code, "message": message, "details": details}
    raise HTTPException(status_code=status_code, detail=payload)
failed: {exc}") + + out: list[dict] = [] + n = len(data.get("text", [])) + for i in range(n): + text = (data["text"][i] or "").strip() + if not text: + continue + try: + confidence = float(data["conf"][i]) + except Exception: + continue + if confidence < min_confidence: + continue + left = int(data["left"][i]) + top = int(data["top"][i]) + width = int(data["width"][i]) + height = int(data["height"][i]) + bbox = {"x": origin_x + left, "y": origin_y + top, "width": width, "height": height} + center = {"x": bbox["x"] + (width // 2), "y": bbox["y"] + (height // 2)} + out.append( + { + "text": text, + "confidence": confidence, + "bbox": bbox, + "center": center, + "region_relative_bbox": {"x": left, "y": top, "width": width, "height": height}, + } + ) + return out + + def serialize_image(image, image_format: str, jpeg_quality: int) -> bytes: buf = io.BytesIO() if image_format == "jpeg": @@ -164,6 +223,39 @@ def enforce_allowed_region(x: int, y: int): raise HTTPException(status_code=403, detail="point outside allowed region") +def _text_matches(candidate: str, needle: str, mode: str, case_sensitive: bool) -> bool: + hay = candidate if case_sensitive else candidate.lower() + ndl = needle if case_sensitive else needle.lower() + if mode == "contains": + return ndl in hay + if mode == "exact": + return hay == ndl + flags = 0 if case_sensitive else re.IGNORECASE + return re.search(needle, candidate, flags=flags) is not None + + +def _resolve_text_match(click_text: ClickTextAction, items: list[dict]) -> dict: + matches = [item for item in items if _text_matches(item["text"], click_text.text, click_text.match, click_text.case_sensitive)] + if not matches: + candidates = [item["text"] for item in sorted(items, key=lambda v: v["confidence"], reverse=True)[:8]] + api_error(404, "ocr_text_not_found", "no OCR text matched", {"query": click_text.text, "candidates": candidates}) + if click_text.occurrence == "best": + return max(matches, key=lambda item: item["confidence"]) + if 
click_text.occurrence == "nth": + idx = (click_text.nth or 1) - 1 + if idx >= len(matches): + api_error(409, "ocr_nth_out_of_range", "requested nth match is out of range", {"match_count": len(matches), "nth": click_text.nth}) + return matches[idx] + if len(matches) > 1 and click_text.match == "exact": + api_error( + 409, + "ocr_text_ambiguous", + "multiple OCR entries matched", + {"match_count": len(matches), "candidates": [item["text"] for item in matches[:8]]}, + ) + return matches[0] + + def import_input_lib(): try: import pyautogui @@ -176,7 +268,10 @@ def import_input_lib(): def exec_action(req: ActionRequest, screen: int = 0) -> dict: run_dry = SETTINGS["dry_run"] or req.dry_run - selected_display, _, screen_selection = select_display(screen) + action_screen = screen + if req.action == "click_text" and req.click_text and req.click_text.screen is not None: + action_screen = req.click_text.screen + selected_display, _, screen_selection = select_display(action_screen) pyautogui = None if run_dry else import_input_lib() resolved_target = None @@ -191,6 +286,36 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict: if req.action == "scroll" and resolved_target is None: raise HTTPException(status_code=400, detail="target is required for scroll") + click_text_match = None + if req.action == "click_text": + if req.click_text is None: + api_error(400, "click_text_payload_required", "click_text payload is required") + region = req.click_text.region + img, captured_region, _, _, _ = capture_region_image( + action_screen, + None if region is None else region.x, + None if region is None else region.y, + None if region is None else region.width, + None if region is None else region.height, + ) + items = extract_ocr_items( + img, + captured_region["x"], + captured_region["y"], + req.click_text.min_confidence, + req.click_text.ocr_lang, + req.click_text.ocr_psm, + ) + matched = _resolve_text_match(req.click_text, items) + 
enforce_allowed_region(matched["center"]["x"], matched["center"]["y"]) + click_text_match = { + "query": req.click_text.model_dump(), + "matched": matched, + "capture_region": captured_region, + "screen": screen_selection, + } + resolved_target = {"x": matched["center"]["x"], "y": matched["center"]["y"], "target_info": {"mode": "ocr_text"}} + if not run_dry: if req.action == "move": pyautogui.moveTo(resolved_target["x"], resolved_target["y"], duration=duration_sec) @@ -211,8 +336,71 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict: if len(req.keys) < 1: raise HTTPException(status_code=400, detail="keys is required for hotkey") pyautogui.hotkey(*req.keys) + elif req.action == "click_text": + pyautogui.click( + x=resolved_target["x"], + y=resolved_target["y"], + clicks=req.clicks, + interval=req.interval_ms / 1000.0, + button=req.button, + duration=duration_sec, + ) - return {"action": req.action, "executed": not run_dry, "dry_run": run_dry, "screen": screen_selection, "display": selected_display, "resolved_target": resolved_target} + return { + "action": req.action, + "executed": not run_dry, + "dry_run": run_dry, + "screen": screen_selection, + "display": selected_display, + "resolved_target": resolved_target, + "click_text_match": click_text_match, + } + + +def _verify_ocr_text_near_point(spec: VerifyOCRTextNearPoint) -> dict: + radius = spec.radius + display, _, _ = select_display(spec.screen) + region_x = max(display["x"], spec.x - radius) + region_y = max(display["y"], spec.y - radius) + max_right = display["x"] + display["width"] + max_bottom = display["y"] + display["height"] + region_right = min(max_right, spec.x + radius) + region_bottom = min(max_bottom, spec.y + radius) + region_w = max(1, region_right - region_x) + region_h = max(1, region_bottom - region_y) + img, region, _, _, screen_selection = capture_region_image(spec.screen, region_x, region_y, region_w, region_h) + items = extract_ocr_items(img, region["x"], region["y"], 
def execute_and_verify(req: InteractVerifyRequest) -> dict:
    """Run one interact action, then poll an OCR check until it passes or times out.

    The action in ``req.action`` is executed exactly once. Afterwards the
    check described by ``req.verify`` is evaluated repeatedly, waiting
    ``req.check_interval_ms`` between attempts, until it reports success or
    ``req.timeout_ms`` has elapsed. The timeout is measured from before the
    action is executed, and at least one check is always performed.

    Returns:
        A dict with ``action_result`` (the ``exec_action`` result),
        ``verified`` (bool), ``attempts`` (number of checks performed),
        ``last_check`` (result of the final check) and ``duration_ms``.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    started = time.monotonic()
    deadline = started + (req.timeout_ms / 1000.0)
    interval_sec = req.check_interval_ms / 1000.0

    action_result = exec_action(req.action.action, req.action.screen)

    attempts = 0
    while True:
        attempts += 1
        last_check = _verify_ocr_text_near_point(req.verify)
        verified = bool(last_check["ok"])
        remaining = deadline - time.monotonic()
        if verified or remaining <= 0:
            return {
                "action_result": action_result,
                "verified": verified,
                "attempts": attempts,
                "last_check": last_check,
                "duration_ms": int((time.monotonic() - started) * 1000),
            }
        # Never sleep past the deadline; the final check then happens at the
        # timeout rather than up to one full interval after it.
        time.sleep(min(interval_sec, remaining))
## Method 2: Interact @@ -27,15 +30,26 @@ Use `POST /interact` for one action at a time. Mouse actions: - `move`, `click`, `right_click`, `double_click`, `middle_click`, `scroll` +- `click_text` (OCR-driven click; optionally scope with `click_text.region`) Keyboard actions: - `type`, `hotkey` Rules: - Prefer `grid` targets derived from fresh `see`/`see/zoom` captures. +- For text buttons/labels, prefer `click_text` and bound OCR with a region when possible. - Use `pixel` only when you already have reliable coordinates. - After each important action, call `see` again before continuing. +## Method 2.5: Action Verify + +Use `POST /interact/verify` to execute one action and immediately validate nearby OCR text state. + +Rules: +- Keep verification narrow: use `ocr_text_near_point` with a focused radius. +- Use short intervals/timeouts for speed (`check_interval_ms` ~250, `timeout_ms` 1000-3000). +- Prefer this over manual re-check loops when immediate confirmation is required. + ## Method 3: Exec Use `POST /exec` only for shell/system tasks. @@ -49,8 +63,8 @@ Rules: 1. `see` capture. 2. If needed, `see/zoom` refine. -3. `interact` one step. -4. `see` verify. +3. `interact` one step (`click_text` for text UI targets). +4. `interact/verify` for action->state confirmation, or `see` verify. 5. Repeat. 
def test_extract_ocr_items_normalization(monkeypatch):
    """extract_ocr_items drops blank/low-confidence rows and offsets boxes by the origin."""
    raw_rows = {
        "text": ["hello", " ", "world"],
        "conf": ["95.0", "-1", "62.5"],
        "left": [10, 12, 40],
        "top": [20, 25, 60],
        "width": [30, 10, 50],
        "height": [10, 10, 12],
    }

    class StubOutput:
        DICT = "DICT"

    class StubTesseract:
        Output = StubOutput

        @staticmethod
        def image_to_data(_image, lang, config, output_type):
            assert lang == "eng"
            assert output_type == "DICT"
            return raw_rows

    # Make the function-local `import pytesseract` resolve to the stub.
    monkeypatch.setitem(sys.modules, "pytesseract", StubTesseract)

    blank = Image.new("RGB", (100, 100))
    items = services.extract_ocr_items(blank, origin_x=100, origin_y=200, min_confidence=60, lang="eng", psm=None)

    assert len(items) == 2
    first, second = items[0], items[1]
    assert first["text"] == "hello"
    assert first["bbox"]["x"] == 110
    assert first["center"]["y"] == 225
    assert second["text"] == "world"
def test_interact_verify_success_and_timeout(monkeypatch):
    """POST /interact/verify reports success on a later attempt and False on timeout."""
    calls = {"n": 0}
    monkeypatch.setattr(services, "exec_action", lambda action, screen=0: {"action": action.action, "executed": True})

    def fake_verify(_spec):
        # Fails on the first poll and succeeds from the second one on.
        calls["n"] += 1
        return {"ok": calls["n"] >= 2, "matches": [], "items_count": 1, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}}

    monkeypatch.setattr(services, "_verify_ocr_text_near_point", fake_verify)
    client = TestClient(app)
    payload = {
        "action": {"screen": 0, "action": {"action": "type", "text": "hello"}},
        "verify": {"type": "ocr_text_near_point", "text": "hello", "x": 100, "y": 100},
        # InteractVerifyRequest requires check_interval_ms >= 50 and
        # timeout_ms >= 100; smaller values are rejected by request
        # validation before the handler runs.
        "check_interval_ms": 50,
        "timeout_ms": 500,
    }
    ok_resp = client.post("/interact/verify", json=payload)
    assert ok_resp.status_code == 200
    ok_data = ok_resp.json()["data"]
    assert ok_data["verified"] is True
    assert ok_data["attempts"] == 2

    monkeypatch.setattr(
        services,
        "_verify_ocr_text_near_point",
        lambda _spec: {"ok": False, "matches": [], "items_count": 0, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}},
    )
    # Smallest allowed timeout keeps the never-verified path fast.
    timeout_resp = client.post("/interact/verify", json={**payload, "timeout_ms": 100})
    assert timeout_resp.status_code == 200
    timeout_data = timeout_resp.json()["data"]
    assert timeout_data["verified"] is False
    assert timeout_data["attempts"] >= 1