Remove interact verify endpoint
All checks were successful
python-syntax / syntax-check (push) Successful in 31s

This commit is contained in:
Space-Banane
2026-05-04 15:59:43 +02:00
parent f05e0c56e6
commit 22ca0097d1
8 changed files with 6 additions and 182 deletions

View File

@@ -28,8 +28,8 @@
- [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs
## Deferred Backlog (Prioritized)
1. [ ] Higher-level automation macros composed from `see` + `interact` + `interact/verify`
1. [ ] Higher-level automation macros composed from `see` + `interact`
2. [ ] Reusable workflow templates (for example: find text -> zoom fallback -> click -> verify)
3. [ ] Batch-safe orchestration primitives with explicit per-step results and auditability
4. [ ] Additional verify primitives beyond `ocr_text_near_point` (image diff region, window title/process state, color/pixel checks)
4. [ ] Additional verify primitives for post-action validation (image diff region, window title/process state, color/pixel checks)
5. [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options

View File

@@ -11,7 +11,6 @@ x-clickthrough-token: <token>
This API is intended for AI computer control through these methods:
- `see`
- `interact`
- `interact/verify`
- `exec`
All responses use one envelope.
@@ -178,44 +177,7 @@ Target modes:
}
```
## 3) Interact Verify
### `POST /interact/verify`
Execute one interact action, then poll quick OCR verification checks until success or timeout.
```json
{
"action": {
"screen": 0,
"action": {
"action": "click_text",
"click_text": {
"text": "Apply",
"match": "contains"
}
}
},
"verify": {
"type": "ocr_text_near_point",
"text": "Applied",
"x": 1180,
"y": 640,
"radius": 120,
"screen": 0,
"match": "contains"
},
"check_interval_ms": 250,
"timeout_ms": 3000
}
```
Response includes:
- `action_result`
- `verified`
- `attempts`
- `last_check`
- `duration_ms`
## 4) Exec
## 3) Exec
### `POST /exec`
Run host shell commands (PowerShell/Bash/CMD).

View File

@@ -58,26 +58,7 @@ def main():
)
click_text.raise_for_status()
click_data = click_text.json()["data"]
target = click_data["resolved_target"]
verify = requests.post(
f"{BASE_URL}/interact/verify",
headers=headers,
json={
"action": {"screen": SCREEN, "action": {"action": "click", "target": {"mode": "pixel", "x": target["x"], "y": target["y"]}}},
"verify": {
"type": "ocr_text_near_point",
"text": label,
"x": target["x"],
"y": target["y"],
"radius": 150,
"screen": SCREEN,
},
"timeout_ms": 1500,
},
timeout=30,
)
verify.raise_for_status()
print("verify:", verify.json()["data"]["verified"])
print("clicked:", click_data["resolved_target"])
if __name__ == "__main__":

View File

@@ -8,14 +8,13 @@ from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from .config import SETTINGS
from .models import ExecRequest, InteractRequest, InteractVerifyRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
from .models import ExecRequest, InteractRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
from .services import (
apply_window_action,
capture_region_image,
capture_screen,
draw_grid,
encode_image,
execute_and_verify,
extract_ocr_items,
exec_action,
exec_command as run_exec_command,
@@ -159,11 +158,6 @@ def interact(req: InteractRequest, _: None = Depends(_auth)):
return _ok(exec_action(req.action, req.screen))
# Route handler for POST /interact/verify.
# Hands the validated request body to execute_and_verify and passes the
# returned dict through _ok (presumably the shared response envelope builder
# used by the other routes - confirm against its definition).
# The `_` parameter is unused; it exists only so Depends(_auth) runs for
# this route.
@app.post("/interact/verify")
def interact_verify(req: InteractVerifyRequest, _: None = Depends(_auth)):
return _ok(execute_and_verify(req))
@app.get("/health")
def health(_: None = Depends(_auth)):
return _ok(

View File

@@ -164,25 +164,4 @@ class ClickTextAction(BaseModel):
return self
# Verification spec: succeed when OCR finds `text` in a square window around
# the point (x, y) on the selected screen.
class VerifyOCRTextNearPoint(BaseModel):
# Discriminator; the only accepted value is "ocr_text_near_point".
type: Literal["ocr_text_near_point"]
# Text to look for in the OCR results (1 to 1000 characters).
text: str = Field(min_length=1, max_length=1000)
# Centre of the search window; both coordinates must be non-negative.
# NOTE(review): presumably pixel coordinates in the same space as the
# display bounds returned by select_display - confirm.
x: int = Field(ge=0)
y: int = Field(ge=0)
# Half-size of the search window around (x, y); 1 to 1000, default 80.
radius: int = Field(default=80, ge=1, le=1000)
# Screen selector handed to select_display / capture_region_image.
screen: int = 0
# How an OCR item's text is compared against `text`.
match: Literal["contains", "exact", "regex"] = "contains"
# Whether the comparison is case-sensitive; off by default.
case_sensitive: bool = False
# Confidence threshold forwarded to extract_ocr_items (0.0 to 100.0).
min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
# OCR language code forwarded to extract_ocr_items.
ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
# Optional OCR mode (0 to 13) forwarded to extract_ocr_items.
# NOTE(review): looks like a Tesseract page-segmentation mode - confirm.
ocr_psm: int | None = Field(default=None, ge=0, le=13)
# Request body for POST /interact/verify: one interact action plus the OCR
# check that is polled after the action has been executed.
class InteractVerifyRequest(BaseModel):
# The interact request to execute once, before polling begins.
action: InteractRequest
# The verification check that is re-run on every poll.
verify: VerifyOCRTextNearPoint
# Pause between failed checks, in milliseconds (50 to 5000, default 250).
check_interval_ms: int = Field(default=250, ge=50, le=5000)
# Polling budget in milliseconds (100 to 60000, default 3000).
timeout_ms: int = Field(default=3000, ge=100, le=60000)
ActionRequest.model_rebuild()

View File

@@ -15,11 +15,9 @@ from .models import (
ActionRequest,
ClickTextAction,
GridTarget,
InteractVerifyRequest,
LaunchRequest,
PixelTarget,
Target,
VerifyOCRTextNearPoint,
WindowActionRequest,
WindowQuery,
)
@@ -357,52 +355,6 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict:
}
def _verify_ocr_text_near_point(spec: VerifyOCRTextNearPoint) -> dict:
    """Run one OCR check for ``spec.text`` in a window around ``(spec.x, spec.y)``.

    The window extends ``spec.radius`` in every direction from the point, is
    clipped to the bounds of the display chosen by ``spec.screen``, and is
    never smaller than 1x1.

    Returns a dict with:
      - ``ok``: True when at least one OCR item matched.
      - ``matches``: the matching OCR items, capped at the first 8.
      - ``items_count``: number of OCR items found in the window.
      - ``screen``: screen selection info from ``capture_region_image``.
      - ``region``: the region that was actually captured.
    """
    reach = spec.radius
    display, _, _ = select_display(spec.screen)

    # Clip the square around the point to the display rectangle.
    left = max(display["x"], spec.x - reach)
    top = max(display["y"], spec.y - reach)
    right = min(display["x"] + display["width"], spec.x + reach)
    bottom = min(display["y"] + display["height"], spec.y + reach)

    # Keep at least a 1x1 capture even if the clipped window collapses.
    width = max(1, right - left)
    height = max(1, bottom - top)

    img, region, _, _, screen_selection = capture_region_image(spec.screen, left, top, width, height)
    items = extract_ocr_items(img, region["x"], region["y"], spec.min_confidence, spec.ocr_lang, spec.ocr_psm)

    hits = []
    for candidate in items:
        if _text_matches(candidate["text"], spec.text, spec.match, spec.case_sensitive):
            hits.append(candidate)

    return {
        "ok": bool(hits),
        "matches": hits[:8],
        "items_count": len(items),
        "screen": screen_selection,
        "region": region,
    }
def execute_and_verify(req: InteractVerifyRequest) -> dict:
    """Execute one interact action, then poll the OCR check until it passes or times out.

    The action is executed exactly once. The verification check is then run
    repeatedly, pausing ``req.check_interval_ms`` between failed attempts,
    until it succeeds or ``req.timeout_ms`` (measured from just before the
    action is executed) has elapsed. At least one check is always performed.

    Returns a dict with:
      - ``action_result``: whatever ``exec_action`` returned.
      - ``verified``: True if a check succeeded before the deadline.
      - ``attempts``: number of checks performed.
      - ``last_check``: result dict of the final check.
      - ``duration_ms``: total elapsed time in whole milliseconds.
    """
    # Use the monotonic clock: wall-clock adjustments (NTP sync, DST, manual
    # changes) must not stretch or shorten the polling budget.
    started = time.monotonic()
    action_result = exec_action(req.action.action, req.action.screen)

    deadline = started + (req.timeout_ms / 1000.0)
    interval_s = req.check_interval_ms / 1000.0
    attempts = 0

    while True:
        attempts += 1
        check = _verify_ocr_text_near_point(req.verify)
        remaining = deadline - time.monotonic()

        if check["ok"] or remaining <= 0:
            return {
                "action_result": action_result,
                "verified": bool(check["ok"]),
                "attempts": attempts,
                "last_check": check,
                "duration_ms": int((time.monotonic() - started) * 1000),
            }

        # Never sleep past the deadline: when the interval is longer than the
        # time left, wake up at the deadline for one final check instead of
        # overshooting timeout_ms by up to a full interval.
        time.sleep(min(interval_s, remaining))
def windows_only(feature: str):
if sys.platform != "win32":
raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")

View File

@@ -8,7 +8,6 @@ description: Use 3 methods to control a computer: see (screenshot+grid), interac
Use these methods:
- `see`
- `interact`
- `interact/verify`
- `exec`
## Method 1: See
@@ -41,15 +40,6 @@ Rules:
- Use `pixel` only when you already have reliable coordinates.
- After each important action, call `see` again before continuing.
## Method 2.5: Action Verify
Use `POST /interact/verify` to execute one action and immediately validate nearby OCR text state.
Rules:
- Keep verification narrow: use `ocr_text_near_point` with a focused radius.
- Use short intervals/timeouts for speed (`check_interval_ms` ~250, `timeout_ms` 1000-3000).
- Prefer this over manual re-check loops when immediate confirmation is required.
## Method 3: Exec
Use `POST /exec` only for shell/system tasks.
@@ -64,7 +54,7 @@ Rules:
1. `see` capture.
2. If needed, `see/zoom` refine.
3. `interact` one step (`click_text` for text UI targets).
4. `interact/verify` for action->state confirmation, or `see` verify.
4. `see` verify.
5. Repeat.
## Quick Safety Rules

View File

@@ -112,37 +112,3 @@ def test_see_ocr_off_on_contract(monkeypatch):
on = client.post("/see", json={"ocr": True, "with_grid": False}, headers=_auth_headers())
assert on.status_code == 200
assert on.json()["data"]["meta"]["ocr"][0]["text"] == "x"
def test_interact_verify_success_and_timeout(monkeypatch):
    """POST /interact/verify reports success on the second poll and failure on timeout."""

    def stub_check(ok, items_count):
        # Minimal stand-in for the dict returned by the real OCR check.
        return {
            "ok": ok,
            "matches": [],
            "items_count": items_count,
            "screen": {"selected": 0},
            "region": {"x": 0, "y": 0, "width": 1, "height": 1},
        }

    # Never touch the real input layer: the action is reported as executed.
    monkeypatch.setattr(services, "exec_action", lambda action, screen=0: {"action": action.action, "executed": True})

    polls = []

    def passes_on_second_poll(_spec):
        polls.append(_spec)
        return stub_check(len(polls) >= 2, 1)

    monkeypatch.setattr(services, "_verify_ocr_text_near_point", passes_on_second_poll)

    client = TestClient(app)
    body = {
        "action": {"screen": 0, "action": {"action": "type", "text": "hello"}},
        "verify": {"type": "ocr_text_near_point", "text": "hello", "x": 100, "y": 100},
        "check_interval_ms": 50,
        "timeout_ms": 500,
    }

    # First request: the check fails once, then succeeds.
    success = client.post("/interact/verify", json=body, headers=_auth_headers())
    assert success.status_code == 200
    success_data = success.json()["data"]
    assert success_data["verified"] is True
    assert success_data["attempts"] == 2

    # Second request: the check never succeeds, so the call runs into the timeout.
    monkeypatch.setattr(services, "_verify_ocr_text_near_point", lambda _spec: stub_check(False, 0))
    timed_out = client.post("/interact/verify", json=body, headers=_auth_headers())
    assert timed_out.status_code == 200
    timed_out_data = timed_out.json()["data"]
    assert timed_out_data["verified"] is False
    assert timed_out_data["attempts"] >= 1