Add pytesseract OCR, click_text interact action, and interact verify endpoint
All checks were successful
python-syntax / syntax-check (push) Successful in 6s

This commit is contained in:
2026-05-03 20:57:34 +02:00
parent 1c03cab457
commit 9e816e0417
8 changed files with 559 additions and 11 deletions

View File

@@ -26,3 +26,8 @@
- [x] Upgrade skill with verify-before-click rules, confidence thresholds, two-phase risky actions, and Spotify playbook
- [x] Add top-level skill section for instance setup + mini API docs
- [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs
## Deferred Backlog
- [ ] Higher-level task macros composed from `see` + `interact` + `interact/verify` primitives
- [ ] Additional verify primitives beyond `ocr_text_near_point` (image-diff region, window title/process state, color/pixel checks)
- [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options

View File

@@ -8,9 +8,10 @@ Auth header when enabled:
x-clickthrough-token: <token>
```
This API is intended for AI computer control through 3 methods only:
This API is intended for AI computer control through these methods:
- `see`
- `interact`
- `interact/verify`
- `exec`
All responses use one envelope.
@@ -62,7 +63,11 @@ Capture a full screen or a region. Optional grid overlay returns coordinate meta
"grid_cols": 12,
"include_labels": true,
"image_format": "png",
"jpeg_quality": 85
"jpeg_quality": 85,
"ocr": false,
"ocr_min_confidence": 0,
"ocr_lang": "eng",
"ocr_psm": null
}
```
@@ -70,6 +75,14 @@ Returns:
- `data.image.base64`
- `data.meta.region` (global desktop coords)
- `data.meta.grid` (rows/cols/cell size + formula)
- `data.meta.ocr` (when `ocr=true`)
OCR item shape:
- `text`
- `confidence`
- `bbox` (global coords)
- `center`
- `region_relative_bbox`
### `POST /see/zoom`
Capture a tighter crop around a global point and draw another grid over that crop.
@@ -126,12 +139,83 @@ Supported actions:
- `scroll` (`scroll_amount`)
- `type` (`text`, `interval_ms`)
- `hotkey` (`keys`)
- `click_text` (OCR-driven text click with optional region)
Target modes:
- `pixel`: absolute global `x,y`
- `grid`: grid cell from a `see`/`see/zoom` response
## 3) Exec
### `click_text` example (full screen OCR)
```json
{
"screen": 0,
"action": {
"action": "click_text",
"click_text": {
"text": "Sign in",
"match": "contains",
"case_sensitive": false,
"min_confidence": 45,
"occurrence": "best"
}
}
}
```
### `click_text` example (region OCR)
```json
{
"screen": 0,
"action": {
"action": "click_text",
"click_text": {
"text": "Continue",
"match": "exact",
"region": { "x": 940, "y": 520, "width": 400, "height": 260 },
"occurrence": "first"
}
}
}
```
## 3) Interact Verify
### `POST /interact/verify`
Execute one interact action, then repeatedly run a quick OCR verification check (every `check_interval_ms`) until it succeeds or `timeout_ms` elapses.
```json
{
"action": {
"screen": 0,
"action": {
"action": "click_text",
"click_text": {
"text": "Apply",
"match": "contains"
}
}
},
"verify": {
"type": "ocr_text_near_point",
"text": "Applied",
"x": 1180,
"y": 640,
"radius": 120,
"screen": 0,
"match": "contains"
},
"check_interval_ms": 250,
"timeout_ms": 3000
}
```
Response includes:
- `action_result`
- `verified`
- `attempts`
- `last_check`
- `duration_ms`
## 4) Exec
### `POST /exec`
Run host shell commands (PowerShell/Bash/CMD).

View File

@@ -35,6 +35,50 @@ def main():
print("region:", payload["meta"]["region"])
print("grid:", payload["meta"].get("grid", {}))
see_ocr = requests.post(
f"{BASE_URL}/see",
headers=headers,
json={"screen": SCREEN, "ocr": True, "with_grid": False, "ocr_min_confidence": 40},
timeout=30,
)
see_ocr.raise_for_status()
ocr_items = see_ocr.json()["data"]["meta"].get("ocr", [])
print("ocr_items:", len(ocr_items))
if ocr_items:
label = ocr_items[0]["text"]
click_text = requests.post(
f"{BASE_URL}/interact",
headers=headers,
json={
"screen": SCREEN,
"action": {"action": "click_text", "click_text": {"text": label, "match": "exact", "occurrence": "first"}},
},
timeout=30,
)
click_text.raise_for_status()
click_data = click_text.json()["data"]
target = click_data["resolved_target"]
verify = requests.post(
f"{BASE_URL}/interact/verify",
headers=headers,
json={
"action": {"screen": SCREEN, "action": {"action": "click", "target": {"mode": "pixel", "x": target["x"], "y": target["y"]}}},
"verify": {
"type": "ocr_text_near_point",
"text": label,
"x": target["x"],
"y": target["y"],
"radius": 150,
"screen": SCREEN,
},
"timeout_ms": 1500,
},
timeout=30,
)
verify.raise_for_status()
print("verify:", verify.json()["data"]["verified"])
if __name__ == "__main__":
main()

View File

@@ -8,13 +8,15 @@ from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from .config import SETTINGS
from .models import ExecRequest, InteractRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
from .models import ExecRequest, InteractRequest, InteractVerifyRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
from .services import (
apply_window_action,
capture_region_image,
capture_screen,
draw_grid,
encode_image,
execute_and_verify,
extract_ocr_items,
exec_action,
exec_command as run_exec_command,
get_displays,
@@ -65,7 +67,8 @@ async def _http_exception_handler(_: Request, exc: HTTPException):
detail = exc.detail
if isinstance(detail, dict):
message = str(detail.get("message", "request failed"))
return _err("http_error", message, exc.status_code, detail)
code = str(detail.get("code", "http_error"))
return _err(code, message, exc.status_code, detail.get("details"))
return _err("http_error", str(detail), exc.status_code)
@@ -99,6 +102,8 @@ def see(req: SeeRequest, _: None = Depends(_auth)):
if req.with_grid:
out_img, grid_meta = draw_grid(image, region["x"], region["y"], req.grid_rows, req.grid_cols, req.include_labels)
meta.update(grid_meta)
if req.ocr:
meta["ocr"] = extract_ocr_items(image, region["x"], region["y"], req.ocr_min_confidence, req.ocr_lang, req.ocr_psm)
return _ok(
{
"image": {
@@ -154,6 +159,11 @@ def interact(req: InteractRequest, _: None = Depends(_auth)):
return _ok(exec_action(req.action, req.screen))
@app.post("/interact/verify")
def interact_verify(req: InteractVerifyRequest, _: None = Depends(_auth)):
    # Run the action plus its OCR verification loop, then wrap the outcome
    # in the success envelope via _ok.
    outcome = execute_and_verify(req)
    return _ok(outcome)
@app.get("/health")
def health(_: None = Depends(_auth)):
return _ok(

View File

@@ -48,6 +48,7 @@ class ActionRequest(BaseModel):
"scroll",
"type",
"hotkey",
"click_text",
]
target: Optional[Target] = None
duration_ms: int = Field(default=0, ge=0, le=20000)
@@ -58,6 +59,13 @@ class ActionRequest(BaseModel):
keys: list[str] = Field(default_factory=list)
interval_ms: int = Field(default=20, ge=0, le=5000)
dry_run: bool = False
click_text: "ClickTextAction | None" = None
@model_validator(mode="after")
def _validate_click_text(self):
    """Require the ``click_text`` payload whenever the action is ``click_text``.

    Returns the instance unchanged when the combination is acceptable and
    raises ``ValueError`` otherwise.
    """
    # Every other action, or a click_text action that carries its payload,
    # passes straight through.
    if self.action != "click_text" or self.click_text is not None:
        return self
    raise ValueError("click_text payload is required when action=click_text")
class ExecRequest(BaseModel):
@@ -103,6 +111,10 @@ class SeeRequest(BaseModel):
include_labels: bool = True
image_format: Literal["png", "jpeg"] = "png"
jpeg_quality: int = Field(default=85, ge=1, le=100)
ocr: bool = False
ocr_min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
ocr_psm: int | None = Field(default=None, ge=0, le=13)
class SeeZoomRequest(BaseModel):
@@ -122,3 +134,55 @@ class SeeZoomRequest(BaseModel):
class InteractRequest(BaseModel):
screen: int = 0
action: ActionRequest
# Rectangle, in pixels, that bounds the capture on which click_text runs OCR.
class OCRRegion(BaseModel):
# Top-left corner of the rectangle; validated to be non-negative.
x: int = Field(ge=0)
y: int = Field(ge=0)
# Size of the rectangle; validated to be strictly positive.
width: int = Field(gt=0)
height: int = Field(gt=0)
class ClickTextAction(BaseModel):
    # Payload of the ``click_text`` interact action: which OCR text to look
    # for, how to compare it, and which of several matches to click.

    # Text (or regular expression, when match="regex") to search for.
    text: str = Field(min_length=1, max_length=1000)
    # Comparison mode applied to each OCR item's text.
    match: Literal["contains", "exact", "regex"] = "contains"
    # Optional capture rectangle; when omitted no region is passed to the capture.
    region: OCRRegion | None = None
    # Optional screen override; when set it replaces the request-level screen.
    screen: int | None = None
    case_sensitive: bool = False
    # OCR items below this confidence are ignored.
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    # Which match to pick: first in OCR order, highest confidence, or the nth.
    occurrence: Literal["first", "best", "nth"] = "first"
    # 1-based index of the wanted match; only meaningful with occurrence="nth".
    nth: int | None = Field(default=None, ge=1, le=10000)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)

    @model_validator(mode="after")
    def _validate_nth(self):
        # ``nth`` and occurrence="nth" must be supplied together.
        if self.occurrence == "nth" and self.nth is None:
            raise ValueError("nth is required when occurrence=nth")
        if self.occurrence != "nth" and self.nth is not None:
            raise ValueError("nth is only allowed when occurrence=nth")
        return self

    @model_validator(mode="after")
    def _validate_regex(self):
        # Reject a malformed pattern at request validation time instead of
        # letting it fail later inside the matcher as an unhandled error.
        if self.match == "regex":
            import re

            try:
                re.compile(self.text)
            except re.error as exc:
                raise ValueError(f"text is not a valid regex: {exc}") from exc
        return self
class VerifyOCRTextNearPoint(BaseModel):
    # Verification spec for /interact/verify: look for OCR text inside a
    # square of side 2*radius centred on (x, y), clamped to the display.

    type: Literal["ocr_text_near_point"]
    # Text (or regular expression, when match="regex") expected near the point.
    text: str = Field(min_length=1, max_length=1000)
    # Centre of the search area; validated to be non-negative.
    x: int = Field(ge=0)
    y: int = Field(ge=0)
    # Half-size of the search area, in pixels.
    radius: int = Field(default=80, ge=1, le=1000)
    screen: int = 0
    match: Literal["contains", "exact", "regex"] = "contains"
    case_sensitive: bool = False
    # OCR items below this confidence are ignored.
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)

    @model_validator(mode="after")
    def _validate_regex(self):
        # The action of an /interact/verify request runs before the first
        # verification check, so a malformed pattern must be rejected up
        # front rather than after the action has already been performed.
        if self.match == "regex":
            import re

            try:
                re.compile(self.text)
            except re.error as exc:
                raise ValueError(f"text is not a valid regex: {exc}") from exc
        return self
# Request body for POST /interact/verify: one interact action plus the OCR
# check that is polled afterwards.
class InteractVerifyRequest(BaseModel):
# The interact request (screen + action) that is executed once, before verification.
action: InteractRequest
# OCR check that is repeated until it passes or the timeout is reached.
verify: VerifyOCRTextNearPoint
# Pause between verification attempts, in milliseconds (validated to 50-5000).
check_interval_ms: int = Field(default=250, ge=50, le=5000)
# Overall verification deadline, in milliseconds (validated to 100-60000).
timeout_ms: int = Field(default=3000, ge=100, le=60000)
ActionRequest.model_rebuild()

View File

@@ -11,7 +11,22 @@ from fastapi import HTTPException
from PIL import ImageChops, ImageStat
from .config import SETTINGS
from .models import ActionRequest, GridTarget, LaunchRequest, PixelTarget, Target, WindowActionRequest, WindowQuery
from .models import (
ActionRequest,
ClickTextAction,
GridTarget,
InteractVerifyRequest,
LaunchRequest,
PixelTarget,
Target,
VerifyOCRTextNearPoint,
WindowActionRequest,
WindowQuery,
)
def api_error(status_code: int, code: str, message: str, details=None):
    """Raise an ``HTTPException`` carrying a structured error payload.

    The exception detail is a dict with the keys ``code``, ``message`` and
    ``details``. This function never returns normally.
    """
    payload = {"code": code, "message": message, "details": details}
    raise HTTPException(status_code=status_code, detail=payload)
def import_capture_libs():
@@ -85,6 +100,50 @@ def capture_region_image(screen: int, region_x: int | None, region_y: int | None
return crop, {"x": region_x, "y": region_y, "width": region_width, "height": region_height}, mon, displays, screen_selection
def extract_ocr_items(image, origin_x: int, origin_y: int, min_confidence: float, lang: str, psm: int | None) -> list[dict]:
    """Run pytesseract on ``image`` and return normalized OCR items.

    Args:
        image: Image handed to ``pytesseract.image_to_data``.
        origin_x, origin_y: Offset added to every box so that ``bbox`` and
            ``center`` are expressed relative to the capture origin supplied
            by the caller.
        min_confidence: Rows with a lower confidence are dropped.
        lang: Value passed through as the ``lang`` argument.
        psm: When not ``None``, passed to tesseract as ``--psm <psm>``.

    Returns:
        A list of dicts with ``text``, ``confidence``, ``bbox``, ``center``
        and ``region_relative_bbox``.

    Raises:
        HTTPException (via ``api_error``): 503 ``ocr_unavailable`` when
        pytesseract cannot be imported, 503 ``ocr_failed`` when the OCR call
        itself raises.
    """
    # NOTE(review): each returned item is one row of image_to_data with
    # non-empty text; presumably these are single words, so multi-word
    # queries may not match a single item -- confirm against tesseract output.
    try:
        import pytesseract
    except Exception as exc:
        api_error(503, "ocr_unavailable", f"pytesseract unavailable: {exc}")

    tess_config = f"--psm {psm}" if psm is not None else ""
    try:
        raw = pytesseract.image_to_data(image, lang=lang, config=tess_config, output_type=pytesseract.Output.DICT)
    except Exception as exc:
        api_error(503, "ocr_failed", f"ocr failed: {exc}")

    results: list[dict] = []
    for idx, raw_text in enumerate(raw.get("text", [])):
        word = (raw_text or "").strip()
        if not word:
            continue

        # Rows whose confidence cannot be parsed as a number are skipped.
        try:
            score = float(raw["conf"][idx])
        except Exception:
            continue
        if score < min_confidence:
            continue

        rel_x = int(raw["left"][idx])
        rel_y = int(raw["top"][idx])
        box_w = int(raw["width"][idx])
        box_h = int(raw["height"][idx])
        abs_x = origin_x + rel_x
        abs_y = origin_y + rel_y

        results.append(
            {
                "text": word,
                "confidence": score,
                "bbox": {"x": abs_x, "y": abs_y, "width": box_w, "height": box_h},
                "center": {"x": abs_x + (box_w // 2), "y": abs_y + (box_h // 2)},
                "region_relative_bbox": {"x": rel_x, "y": rel_y, "width": box_w, "height": box_h},
            }
        )
    return results
def serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
buf = io.BytesIO()
if image_format == "jpeg":
@@ -164,6 +223,39 @@ def enforce_allowed_region(x: int, y: int):
raise HTTPException(status_code=403, detail="point outside allowed region")
def _text_matches(candidate: str, needle: str, mode: str, case_sensitive: bool) -> bool:
    """Tell whether ``candidate`` satisfies ``needle`` under the given mode.

    Args:
        candidate: Text to test (an OCR item's text).
        needle: Query text, or a regular expression for the regex mode.
        mode: ``"contains"`` or ``"exact"``; any other value is treated as
            a regular-expression search.
        case_sensitive: When false, comparison ignores case.

    Returns:
        ``True`` on a match, ``False`` otherwise.

    Raises:
        HTTPException (via ``api_error``): 400 ``invalid_regex`` when
        ``needle`` is not a valid regular expression in regex mode.
    """
    if mode in ("contains", "exact"):
        hay = candidate if case_sensitive else candidate.lower()
        ndl = needle if case_sensitive else needle.lower()
        return ndl in hay if mode == "contains" else hay == ndl

    flags = 0 if case_sensitive else re.IGNORECASE
    try:
        return re.search(needle, candidate, flags=flags) is not None
    except re.error as exc:
        # A malformed pattern is a caller mistake: report it as a structured
        # 400 instead of letting re.error surface as an unhandled error.
        api_error(400, "invalid_regex", f"invalid regex: {exc}", {"pattern": needle})
def _resolve_text_match(click_text: ClickTextAction, items: list[dict]) -> dict:
    """Pick the OCR item that a ``click_text`` query refers to.

    Args:
        click_text: Query describing the text, match mode and occurrence.
        items: OCR items, each with at least ``text`` and ``confidence``.

    Returns:
        The selected item from ``items``.

    Raises:
        HTTPException (via ``api_error``): 404 ``ocr_text_not_found`` when
        nothing matches, 409 ``ocr_nth_out_of_range`` when ``nth`` exceeds
        the number of matches, 409 ``ocr_text_ambiguous`` when an exact
        match with occurrence ``first`` hits more than one item.
    """
    hits = []
    for entry in items:
        if _text_matches(entry["text"], click_text.text, click_text.match, click_text.case_sensitive):
            hits.append(entry)

    if len(hits) == 0:
        # Surface the most confident OCR texts to help the caller adjust the query.
        ranked = sorted(items, key=lambda v: v["confidence"], reverse=True)
        suggestions = [entry["text"] for entry in ranked[:8]]
        api_error(404, "ocr_text_not_found", "no OCR text matched", {"query": click_text.text, "candidates": suggestions})

    wanted = click_text.occurrence
    if wanted == "best":
        return max(hits, key=lambda item: item["confidence"])

    if wanted == "nth":
        position = (click_text.nth or 1) - 1
        if position >= len(hits):
            api_error(409, "ocr_nth_out_of_range", "requested nth match is out of range", {"match_count": len(hits), "nth": click_text.nth})
        return hits[position]

    # occurrence == "first": an exact query that hits several items is
    # refused as ambiguous rather than silently clicking the first one.
    if click_text.match == "exact" and len(hits) > 1:
        api_error(
            409,
            "ocr_text_ambiguous",
            "multiple OCR entries matched",
            {"match_count": len(hits), "candidates": [entry["text"] for entry in hits[:8]]},
        )
    return hits[0]
def import_input_lib():
try:
import pyautogui
@@ -176,7 +268,10 @@ def import_input_lib():
def exec_action(req: ActionRequest, screen: int = 0) -> dict:
run_dry = SETTINGS["dry_run"] or req.dry_run
selected_display, _, screen_selection = select_display(screen)
action_screen = screen
if req.action == "click_text" and req.click_text and req.click_text.screen is not None:
action_screen = req.click_text.screen
selected_display, _, screen_selection = select_display(action_screen)
pyautogui = None if run_dry else import_input_lib()
resolved_target = None
@@ -191,6 +286,36 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict:
if req.action == "scroll" and resolved_target is None:
raise HTTPException(status_code=400, detail="target is required for scroll")
click_text_match = None
if req.action == "click_text":
if req.click_text is None:
api_error(400, "click_text_payload_required", "click_text payload is required")
region = req.click_text.region
img, captured_region, _, _, _ = capture_region_image(
action_screen,
None if region is None else region.x,
None if region is None else region.y,
None if region is None else region.width,
None if region is None else region.height,
)
items = extract_ocr_items(
img,
captured_region["x"],
captured_region["y"],
req.click_text.min_confidence,
req.click_text.ocr_lang,
req.click_text.ocr_psm,
)
matched = _resolve_text_match(req.click_text, items)
enforce_allowed_region(matched["center"]["x"], matched["center"]["y"])
click_text_match = {
"query": req.click_text.model_dump(),
"matched": matched,
"capture_region": captured_region,
"screen": screen_selection,
}
resolved_target = {"x": matched["center"]["x"], "y": matched["center"]["y"], "target_info": {"mode": "ocr_text"}}
if not run_dry:
if req.action == "move":
pyautogui.moveTo(resolved_target["x"], resolved_target["y"], duration=duration_sec)
@@ -211,8 +336,71 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict:
if len(req.keys) < 1:
raise HTTPException(status_code=400, detail="keys is required for hotkey")
pyautogui.hotkey(*req.keys)
elif req.action == "click_text":
pyautogui.click(
x=resolved_target["x"],
y=resolved_target["y"],
clicks=req.clicks,
interval=req.interval_ms / 1000.0,
button=req.button,
duration=duration_sec,
)
return {"action": req.action, "executed": not run_dry, "dry_run": run_dry, "screen": screen_selection, "display": selected_display, "resolved_target": resolved_target}
return {
"action": req.action,
"executed": not run_dry,
"dry_run": run_dry,
"screen": screen_selection,
"display": selected_display,
"resolved_target": resolved_target,
"click_text_match": click_text_match,
}
def _verify_ocr_text_near_point(spec: VerifyOCRTextNearPoint) -> dict:
    """OCR a square around ``(spec.x, spec.y)`` and look for ``spec.text``.

    The square extends ``spec.radius`` pixels in each direction and is
    clamped to the selected display's rectangle; its width and height are
    never allowed to drop below 1.

    Returns:
        A dict with ``ok`` (whether anything matched), ``matches`` (at most
        8 matching items), ``items_count``, ``screen`` and ``region``.
    """
    reach = spec.radius
    display, _, _ = select_display(spec.screen)

    # Clamp the left/top edge, then the right/bottom edge, to the display.
    left = max(display["x"], spec.x - reach)
    top = max(display["y"], spec.y - reach)
    right = min(display["x"] + display["width"], spec.x + reach)
    bottom = min(display["y"] + display["height"], spec.y + reach)
    span_w = max(1, right - left)
    span_h = max(1, bottom - top)

    img, region, _, _, screen_selection = capture_region_image(spec.screen, left, top, span_w, span_h)
    items = extract_ocr_items(img, region["x"], region["y"], spec.min_confidence, spec.ocr_lang, spec.ocr_psm)

    hits = []
    for entry in items:
        if _text_matches(entry["text"], spec.text, spec.match, spec.case_sensitive):
            hits.append(entry)

    return {
        "ok": len(hits) > 0,
        "matches": hits[:8],
        "items_count": len(items),
        "screen": screen_selection,
        "region": region,
    }
def execute_and_verify(req: InteractVerifyRequest) -> dict:
    """Execute one interact action, then poll an OCR check until it passes or times out.

    The action runs exactly once. The verification check then runs at least
    once and is repeated every ``req.check_interval_ms`` until it reports
    ``ok`` or ``req.timeout_ms`` (measured from before the action) elapses.

    Returns:
        A dict with ``action_result``, ``verified``, ``attempts``,
        ``last_check`` and ``duration_ms``.
    """
    # Monotonic clock: the deadline must not move when the wall clock is adjusted.
    started = time.monotonic()
    deadline = started + (req.timeout_ms / 1000.0)
    interval_sec = req.check_interval_ms / 1000.0

    action_result = exec_action(req.action.action, req.action.screen)

    attempts = 0
    while True:
        attempts += 1
        check = _verify_ocr_text_near_point(req.verify)
        now = time.monotonic()
        if check["ok"] or now >= deadline:
            return {
                "action_result": action_result,
                "verified": bool(check["ok"]),
                "attempts": attempts,
                "last_check": check,
                "duration_ms": int((now - started) * 1000),
            }
        # Never sleep past the deadline, so the final check happens at the
        # deadline instead of up to one full interval after it.
        time.sleep(min(interval_sec, deadline - now))
def windows_only(feature: str):

View File

@@ -5,21 +5,24 @@ description: Use 3 methods to control a computer: see (screenshot+grid), interac
# Clickthrough Computer Control
Use exactly 3 methods:
Use these methods:
- `see`
- `interact`
- `interact/verify`
- `exec`
## Method 1: See
Use `POST /see` to capture full screen or a region with a grid overlay.
Use `POST /see/zoom` to capture a tighter crop with a denser grid.
Use `POST /see` with `ocr=true` when text localization is needed.
Rules:
- Start with coarse grid (`12x12`).
- For precision, zoom and use denser grid (`20x20` or higher).
- Always use returned `meta.region` and `meta.grid` when computing click targets.
- Coordinates are global desktop coordinates.
- OCR results are in `data.meta.ocr` and include confidence, bbox, and center.
## Method 2: Interact
@@ -27,15 +30,26 @@ Use `POST /interact` for one action at a time.
Mouse actions:
- `move`, `click`, `right_click`, `double_click`, `middle_click`, `scroll`
- `click_text` (OCR-driven click; optionally scope with `click_text.region`)
Keyboard actions:
- `type`, `hotkey`
Rules:
- Prefer `grid` targets derived from fresh `see`/`see/zoom` captures.
- For text buttons/labels, prefer `click_text` and bound OCR with a region when possible.
- Use `pixel` only when you already have reliable coordinates.
- After each important action, call `see` again before continuing.
## Method 2.5: Action Verify
Use `POST /interact/verify` to execute one action and then immediately confirm, via OCR, that the expected text appears near a given point.
Rules:
- Keep verification narrow: use `ocr_text_near_point` with a focused radius.
- Use short intervals/timeouts for speed (`check_interval_ms` ~250, `timeout_ms` 1000-3000).
- Prefer this over manual re-check loops when immediate confirmation is required.
## Method 3: Exec
Use `POST /exec` only for shell/system tasks.
@@ -49,8 +63,8 @@ Rules:
1. `see` capture.
2. If needed, `see/zoom` refine.
3. `interact` one step.
4. `see` verify.
3. `interact` one step (`click_text` for text UI targets).
4. Use `interact/verify` to confirm that the action produced the expected state, or verify manually with `see`.
5. Repeat.
## Quick Safety Rules

View File

@@ -0,0 +1,139 @@
import sys
from PIL import Image
from fastapi.testclient import TestClient
from server import services
from server.app import app
from server.models import ClickTextAction
def test_extract_ocr_items_normalization(monkeypatch):
    """Blank and low-confidence rows are dropped; boxes are shifted by the origin."""
    canned = {
        "text": ["hello", " ", "world"],
        "conf": ["95.0", "-1", "62.5"],
        "left": [10, 12, 40],
        "top": [20, 25, 60],
        "width": [30, 10, 50],
        "height": [10, 10, 12],
    }

    class FakeOutput:
        DICT = "DICT"

    class FakeTesseract:
        Output = FakeOutput

        @staticmethod
        def image_to_data(_image, lang, config, output_type):
            assert lang == "eng"
            assert output_type == "DICT"
            return canned

    # extract_ocr_items imports pytesseract lazily, so a sys.modules entry is enough.
    monkeypatch.setitem(sys.modules, "pytesseract", FakeTesseract)

    blank = Image.new("RGB", (100, 100))
    items = services.extract_ocr_items(blank, origin_x=100, origin_y=200, min_confidence=60, lang="eng", psm=None)

    assert len(items) == 2
    first, second = items
    assert first["text"] == "hello"
    assert first["bbox"]["x"] == 110
    assert first["center"]["y"] == 225
    assert second["text"] == "world"
def test_resolve_text_match_contains_exact_regex_and_nth():
    """Each match mode and occurrence strategy selects the expected OCR item."""
    ocr_rows = [
        {"text": "Save", "confidence": 70},
        {"text": "Save as", "confidence": 96},
        {"text": "SAVE", "confidence": 88},
    ]

    # contains + first: the earliest item wins.
    query = ClickTextAction(text="save", match="contains", occurrence="first")
    assert services._resolve_text_match(query, ocr_rows)["text"] == "Save"

    # contains + best: the highest confidence wins.
    query = ClickTextAction(text="save", match="contains", occurrence="best")
    assert services._resolve_text_match(query, ocr_rows)["text"] == "Save as"

    # exact + case sensitive: only the upper-case entry qualifies.
    query = ClickTextAction(text="SAVE", match="exact", case_sensitive=True, occurrence="first")
    assert services._resolve_text_match(query, ocr_rows)["text"] == "SAVE"

    # regex + nth: the second item matching the pattern.
    query = ClickTextAction(text="^Save", match="regex", occurrence="nth", nth=2)
    assert services._resolve_text_match(query, ocr_rows)["text"] == "Save as"
def test_interact_click_text_region_optional(monkeypatch):
    """A dry-run click_text without a region resolves to the OCR item's center."""

    def fake_select_display(screen):
        return ({"screen": screen}, [], {"requested": screen, "selected": screen, "fallback": False})

    def fake_capture(screen, x, y, w, h):
        region = {"x": x or 0, "y": y or 0, "width": w or 20, "height": h or 20}
        return (Image.new("RGB", (20, 20)), region, {}, [], {})

    def fake_ocr(*args, **kwargs):
        return [
            {
                "text": "Apply",
                "confidence": 93.0,
                "bbox": {"x": 10, "y": 20, "width": 20, "height": 10},
                "center": {"x": 20, "y": 25},
                "region_relative_bbox": {"x": 10, "y": 20, "width": 20, "height": 10},
            }
        ]

    monkeypatch.setattr(services, "select_display", fake_select_display)
    monkeypatch.setattr(services, "capture_region_image", fake_capture)
    monkeypatch.setattr(services, "extract_ocr_items", fake_ocr)

    request_body = {
        "screen": 0,
        "action": {"action": "click_text", "dry_run": True, "click_text": {"text": "Apply", "match": "contains"}},
    }
    response = TestClient(app).post("/interact", json=request_body)

    assert response.status_code == 200
    data = response.json()["data"]
    assert data["resolved_target"]["x"] == 20
    assert data["click_text_match"]["matched"]["text"] == "Apply"
def test_see_ocr_off_on_contract(monkeypatch):
    """``meta.ocr`` is absent when ocr=false and present when ocr=true."""

    def fake_capture(*args, **kwargs):
        return (Image.new("RGB", (10, 10)), {"x": 0, "y": 0, "width": 10, "height": 10}, {"screen": 0}, [], {})

    # Patch the names as bound inside server.app, which is where /see looks them up.
    monkeypatch.setattr("server.app.capture_region_image", fake_capture)
    monkeypatch.setattr("server.app.encode_image", lambda *args, **kwargs: "abc")
    monkeypatch.setattr("server.app.extract_ocr_items", lambda *args, **kwargs: [{"text": "x"}])

    client = TestClient(app)

    without_ocr = client.post("/see", json={"ocr": False, "with_grid": False})
    assert without_ocr.status_code == 200
    assert "ocr" not in without_ocr.json()["data"]["meta"]

    with_ocr = client.post("/see", json={"ocr": True, "with_grid": False})
    assert with_ocr.status_code == 200
    assert with_ocr.json()["data"]["meta"]["ocr"][0]["text"] == "x"
def test_interact_verify_success_and_timeout(monkeypatch):
    """/interact/verify reports success on a later attempt and failure on timeout."""
    calls = {"n": 0}
    monkeypatch.setattr(services, "exec_action", lambda action, screen=0: {"action": action.action, "executed": True})

    def fake_verify(_spec):
        # Fails on the first poll and succeeds from the second one on.
        calls["n"] += 1
        return {"ok": calls["n"] >= 2, "matches": [], "items_count": 1, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}}

    monkeypatch.setattr(services, "_verify_ocr_text_near_point", fake_verify)

    client = TestClient(app)
    payload = {
        "action": {"screen": 0, "action": {"action": "type", "text": "hello"}},
        "verify": {"type": "ocr_text_near_point", "text": "hello", "x": 100, "y": 100},
        # 50 ms is the smallest interval InteractVerifyRequest accepts (ge=50);
        # a lower value is rejected by request validation before the handler runs.
        "check_interval_ms": 50,
        "timeout_ms": 500,
    }

    ok_resp = client.post("/interact/verify", json=payload)
    assert ok_resp.status_code == 200
    ok_data = ok_resp.json()["data"]
    assert ok_data["verified"] is True
    assert ok_data["attempts"] == 2

    # A check that never passes must end with verified=False once the timeout is hit.
    monkeypatch.setattr(
        services,
        "_verify_ocr_text_near_point",
        lambda _spec: {"ok": False, "matches": [], "items_count": 0, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}},
    )
    timeout_resp = client.post("/interact/verify", json=payload)
    assert timeout_resp.status_code == 200
    timeout_data = timeout_resp.json()["data"]
    assert timeout_data["verified"] is False
    assert timeout_data["attempts"] >= 1