Remove interact verify endpoint
All checks were successful
python-syntax / syntax-check (push) Successful in 31s
This commit is contained in:
TODO.md (4 lines changed)
@@ -28,8 +28,8 @@
|
|||||||
- [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs
|
- [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs
|
||||||
|
|
||||||
## Deferred Backlog (Prioritized)
|
## Deferred Backlog (Prioritized)
|
||||||
1. [ ] Higher-level automation macros composed from `see` + `interact` + `interact/verify`
|
1. [ ] Higher-level automation macros composed from `see` + `interact`
|
||||||
2. [ ] Reusable workflow templates (for example: find text -> zoom fallback -> click -> verify)
|
2. [ ] Reusable workflow templates (for example: find text -> zoom fallback -> click -> verify)
|
||||||
3. [ ] Batch-safe orchestration primitives with explicit per-step results and auditability
|
3. [ ] Batch-safe orchestration primitives with explicit per-step results and auditability
|
||||||
4. [ ] Additional verify primitives beyond `ocr_text_near_point` (image diff region, window title/process state, color/pixel checks)
|
4. [ ] Additional verify primitives for post-action validation (image diff region, window title/process state, color/pixel checks)
|
||||||
5. [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options
|
5. [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options
|
||||||
|
|||||||
docs/API.md (40 lines changed)
@@ -11,7 +11,6 @@ x-clickthrough-token: <token>
|
|||||||
This API is intended for AI computer control through these methods:
|
This API is intended for AI computer control through these methods:
|
||||||
- `see`
|
- `see`
|
||||||
- `interact`
|
- `interact`
|
||||||
- `interact/verify`
|
|
||||||
- `exec`
|
- `exec`
|
||||||
|
|
||||||
All responses use one envelope.
|
All responses use one envelope.
|
||||||
@@ -178,44 +177,7 @@ Target modes:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## 3) Interact Verify
|
## 3) Exec
|
||||||
|
|
||||||
### `POST /interact/verify`
|
|
||||||
Execute one interact action, then poll quick OCR verification checks until success or timeout.
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"action": {
|
|
||||||
"screen": 0,
|
|
||||||
"action": {
|
|
||||||
"action": "click_text",
|
|
||||||
"click_text": {
|
|
||||||
"text": "Apply",
|
|
||||||
"match": "contains"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"verify": {
|
|
||||||
"type": "ocr_text_near_point",
|
|
||||||
"text": "Applied",
|
|
||||||
"x": 1180,
|
|
||||||
"y": 640,
|
|
||||||
"radius": 120,
|
|
||||||
"screen": 0,
|
|
||||||
"match": "contains"
|
|
||||||
},
|
|
||||||
"check_interval_ms": 250,
|
|
||||||
"timeout_ms": 3000
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Response includes:
|
|
||||||
- `action_result`
|
|
||||||
- `verified`
|
|
||||||
- `attempts`
|
|
||||||
- `last_check`
|
|
||||||
- `duration_ms`
|
|
||||||
## 4) Exec
|
|
||||||
|
|
||||||
### `POST /exec`
|
### `POST /exec`
|
||||||
Run host shell commands (PowerShell/Bash/CMD).
|
Run host shell commands (PowerShell/Bash/CMD).
|
||||||
|
|||||||
@@ -58,26 +58,7 @@ def main():
|
|||||||
)
|
)
|
||||||
click_text.raise_for_status()
|
click_text.raise_for_status()
|
||||||
click_data = click_text.json()["data"]
|
click_data = click_text.json()["data"]
|
||||||
target = click_data["resolved_target"]
|
print("clicked:", click_data["resolved_target"])
|
||||||
verify = requests.post(
|
|
||||||
f"{BASE_URL}/interact/verify",
|
|
||||||
headers=headers,
|
|
||||||
json={
|
|
||||||
"action": {"screen": SCREEN, "action": {"action": "click", "target": {"mode": "pixel", "x": target["x"], "y": target["y"]}}},
|
|
||||||
"verify": {
|
|
||||||
"type": "ocr_text_near_point",
|
|
||||||
"text": label,
|
|
||||||
"x": target["x"],
|
|
||||||
"y": target["y"],
|
|
||||||
"radius": 150,
|
|
||||||
"screen": SCREEN,
|
|
||||||
},
|
|
||||||
"timeout_ms": 1500,
|
|
||||||
},
|
|
||||||
timeout=30,
|
|
||||||
)
|
|
||||||
verify.raise_for_status()
|
|
||||||
print("verify:", verify.json()["data"]["verified"])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -8,14 +8,13 @@ from fastapi.exceptions import RequestValidationError
|
|||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
from .config import SETTINGS
|
from .config import SETTINGS
|
||||||
from .models import ExecRequest, InteractRequest, InteractVerifyRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
|
from .models import ExecRequest, InteractRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
|
||||||
from .services import (
|
from .services import (
|
||||||
apply_window_action,
|
apply_window_action,
|
||||||
capture_region_image,
|
capture_region_image,
|
||||||
capture_screen,
|
capture_screen,
|
||||||
draw_grid,
|
draw_grid,
|
||||||
encode_image,
|
encode_image,
|
||||||
execute_and_verify,
|
|
||||||
extract_ocr_items,
|
extract_ocr_items,
|
||||||
exec_action,
|
exec_action,
|
||||||
exec_command as run_exec_command,
|
exec_command as run_exec_command,
|
||||||
@@ -159,11 +158,6 @@ def interact(req: InteractRequest, _: None = Depends(_auth)):
|
|||||||
return _ok(exec_action(req.action, req.screen))
|
return _ok(exec_action(req.action, req.screen))
|
||||||
|
|
||||||
|
|
||||||
@app.post("/interact/verify")
|
|
||||||
def interact_verify(req: InteractVerifyRequest, _: None = Depends(_auth)):
|
|
||||||
return _ok(execute_and_verify(req))
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health(_: None = Depends(_auth)):
|
def health(_: None = Depends(_auth)):
|
||||||
return _ok(
|
return _ok(
|
||||||
|
|||||||
@@ -164,25 +164,4 @@ class ClickTextAction(BaseModel):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
class VerifyOCRTextNearPoint(BaseModel):
|
|
||||||
type: Literal["ocr_text_near_point"]
|
|
||||||
text: str = Field(min_length=1, max_length=1000)
|
|
||||||
x: int = Field(ge=0)
|
|
||||||
y: int = Field(ge=0)
|
|
||||||
radius: int = Field(default=80, ge=1, le=1000)
|
|
||||||
screen: int = 0
|
|
||||||
match: Literal["contains", "exact", "regex"] = "contains"
|
|
||||||
case_sensitive: bool = False
|
|
||||||
min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
|
|
||||||
ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
|
|
||||||
ocr_psm: int | None = Field(default=None, ge=0, le=13)
|
|
||||||
|
|
||||||
|
|
||||||
class InteractVerifyRequest(BaseModel):
|
|
||||||
action: InteractRequest
|
|
||||||
verify: VerifyOCRTextNearPoint
|
|
||||||
check_interval_ms: int = Field(default=250, ge=50, le=5000)
|
|
||||||
timeout_ms: int = Field(default=3000, ge=100, le=60000)
|
|
||||||
|
|
||||||
|
|
||||||
ActionRequest.model_rebuild()
|
ActionRequest.model_rebuild()
|
||||||
|
|||||||
@@ -15,11 +15,9 @@ from .models import (
|
|||||||
ActionRequest,
|
ActionRequest,
|
||||||
ClickTextAction,
|
ClickTextAction,
|
||||||
GridTarget,
|
GridTarget,
|
||||||
InteractVerifyRequest,
|
|
||||||
LaunchRequest,
|
LaunchRequest,
|
||||||
PixelTarget,
|
PixelTarget,
|
||||||
Target,
|
Target,
|
||||||
VerifyOCRTextNearPoint,
|
|
||||||
WindowActionRequest,
|
WindowActionRequest,
|
||||||
WindowQuery,
|
WindowQuery,
|
||||||
)
|
)
|
||||||
@@ -357,52 +355,6 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict:
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _verify_ocr_text_near_point(spec: VerifyOCRTextNearPoint) -> dict:
|
|
||||||
radius = spec.radius
|
|
||||||
display, _, _ = select_display(spec.screen)
|
|
||||||
region_x = max(display["x"], spec.x - radius)
|
|
||||||
region_y = max(display["y"], spec.y - radius)
|
|
||||||
max_right = display["x"] + display["width"]
|
|
||||||
max_bottom = display["y"] + display["height"]
|
|
||||||
region_right = min(max_right, spec.x + radius)
|
|
||||||
region_bottom = min(max_bottom, spec.y + radius)
|
|
||||||
region_w = max(1, region_right - region_x)
|
|
||||||
region_h = max(1, region_bottom - region_y)
|
|
||||||
img, region, _, _, screen_selection = capture_region_image(spec.screen, region_x, region_y, region_w, region_h)
|
|
||||||
items = extract_ocr_items(img, region["x"], region["y"], spec.min_confidence, spec.ocr_lang, spec.ocr_psm)
|
|
||||||
matches = [item for item in items if _text_matches(item["text"], spec.text, spec.match, spec.case_sensitive)]
|
|
||||||
return {"ok": len(matches) > 0, "matches": matches[:8], "items_count": len(items), "screen": screen_selection, "region": region}
|
|
||||||
|
|
||||||
|
|
||||||
def execute_and_verify(req: InteractVerifyRequest) -> dict:
|
|
||||||
started = time.time()
|
|
||||||
action_result = exec_action(req.action.action, req.action.screen)
|
|
||||||
attempts = 0
|
|
||||||
last_check = None
|
|
||||||
deadline = started + (req.timeout_ms / 1000.0)
|
|
||||||
while True:
|
|
||||||
attempts += 1
|
|
||||||
check = _verify_ocr_text_near_point(req.verify)
|
|
||||||
last_check = check
|
|
||||||
if check["ok"]:
|
|
||||||
return {
|
|
||||||
"action_result": action_result,
|
|
||||||
"verified": True,
|
|
||||||
"attempts": attempts,
|
|
||||||
"last_check": last_check,
|
|
||||||
"duration_ms": int((time.time() - started) * 1000),
|
|
||||||
}
|
|
||||||
if time.time() >= deadline:
|
|
||||||
return {
|
|
||||||
"action_result": action_result,
|
|
||||||
"verified": False,
|
|
||||||
"attempts": attempts,
|
|
||||||
"last_check": last_check,
|
|
||||||
"duration_ms": int((time.time() - started) * 1000),
|
|
||||||
}
|
|
||||||
time.sleep(req.check_interval_ms / 1000.0)
|
|
||||||
|
|
||||||
|
|
||||||
def windows_only(feature: str):
|
def windows_only(feature: str):
|
||||||
if sys.platform != "win32":
|
if sys.platform != "win32":
|
||||||
raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")
|
raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ description: Use 3 methods to control a computer: see (screenshot+grid), interac
|
|||||||
Use these methods:
|
Use these methods:
|
||||||
- `see`
|
- `see`
|
||||||
- `interact`
|
- `interact`
|
||||||
- `interact/verify`
|
|
||||||
- `exec`
|
- `exec`
|
||||||
|
|
||||||
## Method 1: See
|
## Method 1: See
|
||||||
@@ -41,15 +40,6 @@ Rules:
|
|||||||
- Use `pixel` only when you already have reliable coordinates.
|
- Use `pixel` only when you already have reliable coordinates.
|
||||||
- After each important action, call `see` again before continuing.
|
- After each important action, call `see` again before continuing.
|
||||||
|
|
||||||
## Method 2.5: Action Verify
|
|
||||||
|
|
||||||
Use `POST /interact/verify` to execute one action and immediately validate nearby OCR text state.
|
|
||||||
|
|
||||||
Rules:
|
|
||||||
- Keep verification narrow: use `ocr_text_near_point` with a focused radius.
|
|
||||||
- Use short intervals/timeouts for speed (`check_interval_ms` ~250, `timeout_ms` 1000-3000).
|
|
||||||
- Prefer this over manual re-check loops when immediate confirmation is required.
|
|
||||||
|
|
||||||
## Method 3: Exec
|
## Method 3: Exec
|
||||||
|
|
||||||
Use `POST /exec` only for shell/system tasks.
|
Use `POST /exec` only for shell/system tasks.
|
||||||
@@ -64,7 +54,7 @@ Rules:
|
|||||||
1. `see` capture.
|
1. `see` capture.
|
||||||
2. If needed, `see/zoom` refine.
|
2. If needed, `see/zoom` refine.
|
||||||
3. `interact` one step (`click_text` for text UI targets).
|
3. `interact` one step (`click_text` for text UI targets).
|
||||||
4. `interact/verify` for action->state confirmation, or `see` verify.
|
4. `see` verify.
|
||||||
5. Repeat.
|
5. Repeat.
|
||||||
|
|
||||||
## Quick Safety Rules
|
## Quick Safety Rules
|
||||||
|
|||||||
@@ -112,37 +112,3 @@ def test_see_ocr_off_on_contract(monkeypatch):
|
|||||||
on = client.post("/see", json={"ocr": True, "with_grid": False}, headers=_auth_headers())
|
on = client.post("/see", json={"ocr": True, "with_grid": False}, headers=_auth_headers())
|
||||||
assert on.status_code == 200
|
assert on.status_code == 200
|
||||||
assert on.json()["data"]["meta"]["ocr"][0]["text"] == "x"
|
assert on.json()["data"]["meta"]["ocr"][0]["text"] == "x"
|
||||||
|
|
||||||
|
|
||||||
def test_interact_verify_success_and_timeout(monkeypatch):
|
|
||||||
calls = {"n": 0}
|
|
||||||
monkeypatch.setattr(services, "exec_action", lambda action, screen=0: {"action": action.action, "executed": True})
|
|
||||||
|
|
||||||
def fake_verify(_spec):
|
|
||||||
calls["n"] += 1
|
|
||||||
return {"ok": calls["n"] >= 2, "matches": [], "items_count": 1, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}}
|
|
||||||
|
|
||||||
monkeypatch.setattr(services, "_verify_ocr_text_near_point", fake_verify)
|
|
||||||
client = TestClient(app)
|
|
||||||
payload = {
|
|
||||||
"action": {"screen": 0, "action": {"action": "type", "text": "hello"}},
|
|
||||||
"verify": {"type": "ocr_text_near_point", "text": "hello", "x": 100, "y": 100},
|
|
||||||
"check_interval_ms": 50,
|
|
||||||
"timeout_ms": 500,
|
|
||||||
}
|
|
||||||
ok_resp = client.post("/interact/verify", json=payload, headers=_auth_headers())
|
|
||||||
assert ok_resp.status_code == 200
|
|
||||||
ok_data = ok_resp.json()["data"]
|
|
||||||
assert ok_data["verified"] is True
|
|
||||||
assert ok_data["attempts"] == 2
|
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
services,
|
|
||||||
"_verify_ocr_text_near_point",
|
|
||||||
lambda _spec: {"ok": False, "matches": [], "items_count": 0, "screen": {"selected": 0}, "region": {"x": 0, "y": 0, "width": 1, "height": 1}},
|
|
||||||
)
|
|
||||||
timeout_resp = client.post("/interact/verify", json=payload, headers=_auth_headers())
|
|
||||||
assert timeout_resp.status_code == 200
|
|
||||||
timeout_data = timeout_resp.json()["data"]
|
|
||||||
assert timeout_data["verified"] is False
|
|
||||||
assert timeout_data["attempts"] >= 1
|
|
||||||
|
|||||||
Reference in New Issue
Block a user