Add pytesseract OCR, click_text interact action, and interact verify endpoint
All checks were successful
python-syntax / syntax-check (push) Successful in 6s
All checks were successful
python-syntax / syntax-check (push) Successful in 6s
This commit is contained in:
5
TODO.md
5
TODO.md
@@ -26,3 +26,8 @@
|
|||||||
- [x] Upgrade skill with verify-before-click rules, confidence thresholds, two-phase risky actions, and Spotify playbook
|
- [x] Upgrade skill with verify-before-click rules, confidence thresholds, two-phase risky actions, and Spotify playbook
|
||||||
- [x] Add top-level skill section for instance setup + mini API docs
|
- [x] Add top-level skill section for instance setup + mini API docs
|
||||||
- [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs
|
- [x] Clarify user-owned setup responsibilities vs agent responsibilities in skill docs
|
||||||
|
|
||||||
|
## Deferred Backlog
|
||||||
|
- [ ] Higher-level task macros composed from `see` + `interact` + `interact/verify` primitives
|
||||||
|
- [ ] Additional verify primitives beyond `ocr_text_near_point` (image-diff region, window title/process state, color/pixel checks)
|
||||||
|
- [ ] Broader API simplification pass to reduce payload overlap and consolidate shared OCR options
|
||||||
|
|||||||
90
docs/API.md
90
docs/API.md
@@ -8,9 +8,10 @@ Auth header when enabled:
|
|||||||
x-clickthrough-token: <token>
|
x-clickthrough-token: <token>
|
||||||
```
|
```
|
||||||
|
|
||||||
This API is intended for AI computer control through 3 methods only:
|
This API is intended for AI computer control through these methods:
|
||||||
- `see`
|
- `see`
|
||||||
- `interact`
|
- `interact`
|
||||||
|
- `interact/verify`
|
||||||
- `exec`
|
- `exec`
|
||||||
|
|
||||||
All responses use one envelope.
|
All responses use one envelope.
|
||||||
@@ -62,7 +63,11 @@ Capture a full screen or a region. Optional grid overlay returns coordinate meta
|
|||||||
"grid_cols": 12,
|
"grid_cols": 12,
|
||||||
"include_labels": true,
|
"include_labels": true,
|
||||||
"image_format": "png",
|
"image_format": "png",
|
||||||
"jpeg_quality": 85
|
"jpeg_quality": 85,
|
||||||
|
"ocr": false,
|
||||||
|
"ocr_min_confidence": 0,
|
||||||
|
"ocr_lang": "eng",
|
||||||
|
"ocr_psm": null
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -70,6 +75,14 @@ Returns:
|
|||||||
- `data.image.base64`
|
- `data.image.base64`
|
||||||
- `data.meta.region` (global desktop coords)
|
- `data.meta.region` (global desktop coords)
|
||||||
- `data.meta.grid` (rows/cols/cell size + formula)
|
- `data.meta.grid` (rows/cols/cell size + formula)
|
||||||
|
- `data.meta.ocr` (when `ocr=true`)
|
||||||
|
|
||||||
|
OCR item shape:
|
||||||
|
- `text`
|
||||||
|
- `confidence`
|
||||||
|
- `bbox` (global coords)
|
||||||
|
- `center`
|
||||||
|
- `region_relative_bbox`
|
||||||
|
|
||||||
### `POST /see/zoom`
|
### `POST /see/zoom`
|
||||||
Capture a tighter crop around a global point and draw another grid over that crop.
|
Capture a tighter crop around a global point and draw another grid over that crop.
|
||||||
@@ -126,12 +139,83 @@ Supported actions:
|
|||||||
- `scroll` (`scroll_amount`)
|
- `scroll` (`scroll_amount`)
|
||||||
- `type` (`text`, `interval_ms`)
|
- `type` (`text`, `interval_ms`)
|
||||||
- `hotkey` (`keys`)
|
- `hotkey` (`keys`)
|
||||||
|
- `click_text` (OCR-driven text click with optional region)
|
||||||
|
|
||||||
Target modes:
|
Target modes:
|
||||||
- `pixel`: absolute global `x,y`
|
- `pixel`: absolute global `x,y`
|
||||||
- `grid`: grid cell from a `see`/`see/zoom` response
|
- `grid`: grid cell from a `see`/`see/zoom` response
|
||||||
|
|
||||||
## 3) Exec
|
### `click_text` example (full screen OCR)
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"screen": 0,
|
||||||
|
"action": {
|
||||||
|
"action": "click_text",
|
||||||
|
"click_text": {
|
||||||
|
"text": "Sign in",
|
||||||
|
"match": "contains",
|
||||||
|
"case_sensitive": false,
|
||||||
|
"min_confidence": 45,
|
||||||
|
"occurrence": "best"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `click_text` example (region OCR)
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"screen": 0,
|
||||||
|
"action": {
|
||||||
|
"action": "click_text",
|
||||||
|
"click_text": {
|
||||||
|
"text": "Continue",
|
||||||
|
"match": "exact",
|
||||||
|
"region": { "x": 940, "y": 520, "width": 400, "height": 260 },
|
||||||
|
"occurrence": "first"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3) Interact Verify
|
||||||
|
|
||||||
|
### `POST /interact/verify`
|
||||||
|
Execute one interact action, then poll quick OCR verification checks until success or timeout.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"action": {
|
||||||
|
"screen": 0,
|
||||||
|
"action": {
|
||||||
|
"action": "click_text",
|
||||||
|
"click_text": {
|
||||||
|
"text": "Apply",
|
||||||
|
"match": "contains"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"verify": {
|
||||||
|
"type": "ocr_text_near_point",
|
||||||
|
"text": "Applied",
|
||||||
|
"x": 1180,
|
||||||
|
"y": 640,
|
||||||
|
"radius": 120,
|
||||||
|
"screen": 0,
|
||||||
|
"match": "contains"
|
||||||
|
},
|
||||||
|
"check_interval_ms": 250,
|
||||||
|
"timeout_ms": 3000
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Response includes:
|
||||||
|
- `action_result`
|
||||||
|
- `verified`
|
||||||
|
- `attempts`
|
||||||
|
- `last_check`
|
||||||
|
- `duration_ms`
|
||||||
|
## 4) Exec
|
||||||
|
|
||||||
### `POST /exec`
|
### `POST /exec`
|
||||||
Run host shell commands (PowerShell/Bash/CMD).
|
Run host shell commands (PowerShell/Bash/CMD).
|
||||||
|
|||||||
@@ -35,6 +35,50 @@ def main():
|
|||||||
print("region:", payload["meta"]["region"])
|
print("region:", payload["meta"]["region"])
|
||||||
print("grid:", payload["meta"].get("grid", {}))
|
print("grid:", payload["meta"].get("grid", {}))
|
||||||
|
|
||||||
|
see_ocr = requests.post(
|
||||||
|
f"{BASE_URL}/see",
|
||||||
|
headers=headers,
|
||||||
|
json={"screen": SCREEN, "ocr": True, "with_grid": False, "ocr_min_confidence": 40},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
see_ocr.raise_for_status()
|
||||||
|
ocr_items = see_ocr.json()["data"]["meta"].get("ocr", [])
|
||||||
|
print("ocr_items:", len(ocr_items))
|
||||||
|
|
||||||
|
if ocr_items:
|
||||||
|
label = ocr_items[0]["text"]
|
||||||
|
click_text = requests.post(
|
||||||
|
f"{BASE_URL}/interact",
|
||||||
|
headers=headers,
|
||||||
|
json={
|
||||||
|
"screen": SCREEN,
|
||||||
|
"action": {"action": "click_text", "click_text": {"text": label, "match": "exact", "occurrence": "first"}},
|
||||||
|
},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
click_text.raise_for_status()
|
||||||
|
click_data = click_text.json()["data"]
|
||||||
|
target = click_data["resolved_target"]
|
||||||
|
verify = requests.post(
|
||||||
|
f"{BASE_URL}/interact/verify",
|
||||||
|
headers=headers,
|
||||||
|
json={
|
||||||
|
"action": {"screen": SCREEN, "action": {"action": "click", "target": {"mode": "pixel", "x": target["x"], "y": target["y"]}}},
|
||||||
|
"verify": {
|
||||||
|
"type": "ocr_text_near_point",
|
||||||
|
"text": label,
|
||||||
|
"x": target["x"],
|
||||||
|
"y": target["y"],
|
||||||
|
"radius": 150,
|
||||||
|
"screen": SCREEN,
|
||||||
|
},
|
||||||
|
"timeout_ms": 1500,
|
||||||
|
},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
verify.raise_for_status()
|
||||||
|
print("verify:", verify.json()["data"]["verified"])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -8,13 +8,15 @@ from fastapi.exceptions import RequestValidationError
|
|||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
|
|
||||||
from .config import SETTINGS
|
from .config import SETTINGS
|
||||||
from .models import ExecRequest, InteractRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
|
from .models import ExecRequest, InteractRequest, InteractVerifyRequest, LaunchRequest, SeeRequest, SeeZoomRequest, WindowActionRequest, WindowQuery
|
||||||
from .services import (
|
from .services import (
|
||||||
apply_window_action,
|
apply_window_action,
|
||||||
capture_region_image,
|
capture_region_image,
|
||||||
capture_screen,
|
capture_screen,
|
||||||
draw_grid,
|
draw_grid,
|
||||||
encode_image,
|
encode_image,
|
||||||
|
execute_and_verify,
|
||||||
|
extract_ocr_items,
|
||||||
exec_action,
|
exec_action,
|
||||||
exec_command as run_exec_command,
|
exec_command as run_exec_command,
|
||||||
get_displays,
|
get_displays,
|
||||||
@@ -65,7 +67,8 @@ async def _http_exception_handler(_: Request, exc: HTTPException):
|
|||||||
detail = exc.detail
|
detail = exc.detail
|
||||||
if isinstance(detail, dict):
|
if isinstance(detail, dict):
|
||||||
message = str(detail.get("message", "request failed"))
|
message = str(detail.get("message", "request failed"))
|
||||||
return _err("http_error", message, exc.status_code, detail)
|
code = str(detail.get("code", "http_error"))
|
||||||
|
return _err(code, message, exc.status_code, detail.get("details"))
|
||||||
return _err("http_error", str(detail), exc.status_code)
|
return _err("http_error", str(detail), exc.status_code)
|
||||||
|
|
||||||
|
|
||||||
@@ -99,6 +102,8 @@ def see(req: SeeRequest, _: None = Depends(_auth)):
|
|||||||
if req.with_grid:
|
if req.with_grid:
|
||||||
out_img, grid_meta = draw_grid(image, region["x"], region["y"], req.grid_rows, req.grid_cols, req.include_labels)
|
out_img, grid_meta = draw_grid(image, region["x"], region["y"], req.grid_rows, req.grid_cols, req.include_labels)
|
||||||
meta.update(grid_meta)
|
meta.update(grid_meta)
|
||||||
|
if req.ocr:
|
||||||
|
meta["ocr"] = extract_ocr_items(image, region["x"], region["y"], req.ocr_min_confidence, req.ocr_lang, req.ocr_psm)
|
||||||
return _ok(
|
return _ok(
|
||||||
{
|
{
|
||||||
"image": {
|
"image": {
|
||||||
@@ -154,6 +159,11 @@ def interact(req: InteractRequest, _: None = Depends(_auth)):
|
|||||||
return _ok(exec_action(req.action, req.screen))
|
return _ok(exec_action(req.action, req.screen))
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/interact/verify")
def interact_verify(req: InteractVerifyRequest, _: None = Depends(_auth)):
    """Handle ``POST /interact/verify``.

    Hands the validated request to ``execute_and_verify`` and returns its
    result through ``_ok``.
    """
    outcome = execute_and_verify(req)
    return _ok(outcome)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
def health(_: None = Depends(_auth)):
|
def health(_: None = Depends(_auth)):
|
||||||
return _ok(
|
return _ok(
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ class ActionRequest(BaseModel):
|
|||||||
"scroll",
|
"scroll",
|
||||||
"type",
|
"type",
|
||||||
"hotkey",
|
"hotkey",
|
||||||
|
"click_text",
|
||||||
]
|
]
|
||||||
target: Optional[Target] = None
|
target: Optional[Target] = None
|
||||||
duration_ms: int = Field(default=0, ge=0, le=20000)
|
duration_ms: int = Field(default=0, ge=0, le=20000)
|
||||||
@@ -58,6 +59,13 @@ class ActionRequest(BaseModel):
|
|||||||
keys: list[str] = Field(default_factory=list)
|
keys: list[str] = Field(default_factory=list)
|
||||||
interval_ms: int = Field(default=20, ge=0, le=5000)
|
interval_ms: int = Field(default=20, ge=0, le=5000)
|
||||||
dry_run: bool = False
|
dry_run: bool = False
|
||||||
|
click_text: "ClickTextAction | None" = None
|
||||||
|
|
||||||
|
@model_validator(mode="after")
|
||||||
|
def _validate_click_text(self):
|
||||||
|
if self.action == "click_text" and self.click_text is None:
|
||||||
|
raise ValueError("click_text payload is required when action=click_text")
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
class ExecRequest(BaseModel):
|
class ExecRequest(BaseModel):
|
||||||
@@ -103,6 +111,10 @@ class SeeRequest(BaseModel):
|
|||||||
include_labels: bool = True
|
include_labels: bool = True
|
||||||
image_format: Literal["png", "jpeg"] = "png"
|
image_format: Literal["png", "jpeg"] = "png"
|
||||||
jpeg_quality: int = Field(default=85, ge=1, le=100)
|
jpeg_quality: int = Field(default=85, ge=1, le=100)
|
||||||
|
ocr: bool = False
|
||||||
|
ocr_min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
|
||||||
|
ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
|
||||||
|
ocr_psm: int | None = Field(default=None, ge=0, le=13)
|
||||||
|
|
||||||
|
|
||||||
class SeeZoomRequest(BaseModel):
|
class SeeZoomRequest(BaseModel):
|
||||||
@@ -122,3 +134,55 @@ class SeeZoomRequest(BaseModel):
|
|||||||
class InteractRequest(BaseModel):
|
class InteractRequest(BaseModel):
|
||||||
screen: int = 0
|
screen: int = 0
|
||||||
action: ActionRequest
|
action: ActionRequest
|
||||||
|
|
||||||
|
|
||||||
|
class OCRRegion(BaseModel):
    """Rectangle used to bound the OCR capture of a ``click_text`` action."""

    # Top-left corner of the rectangle; negative values are rejected.
    x: int = Field(ge=0)
    y: int = Field(ge=0)

    # Extent of the rectangle; both dimensions must be strictly positive.
    width: int = Field(gt=0)
    height: int = Field(gt=0)
|
||||||
|
|
||||||
|
|
||||||
|
class ClickTextAction(BaseModel):
    """Payload for the OCR-driven ``click_text`` interact action.

    The action OCRs a screen capture, selects one OCR entry whose text
    matches ``text`` and clicks the centre of that entry.
    """

    # Query to look for; interpreted according to ``match``.
    text: str = Field(min_length=1, max_length=1000)
    match: Literal["contains", "exact", "regex"] = "contains"
    # Optional rectangle limiting the OCR capture; full capture when omitted.
    region: OCRRegion | None = None
    # When set, takes precedence over the screen given on the interact request.
    screen: int | None = None
    case_sensitive: bool = False
    # OCR entries below this confidence are ignored.
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    # Which match to click: first found, highest confidence, or the nth one.
    occurrence: Literal["first", "best", "nth"] = "first"
    nth: int | None = Field(default=None, ge=1, le=10000)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)

    @model_validator(mode="after")
    def _validate_nth(self):
        """Require ``nth`` exactly when ``occurrence`` is ``"nth"``."""
        if self.occurrence == "nth" and self.nth is None:
            raise ValueError("nth is required when occurrence=nth")
        if self.occurrence != "nth" and self.nth is not None:
            raise ValueError("nth is only allowed when occurrence=nth")
        return self

    @model_validator(mode="after")
    def _validate_regex(self):
        """Reject an uncompilable pattern up front when ``match`` is ``"regex"``.

        Without this check the pattern is first compiled while matching OCR
        results, where ``re.error`` escapes as an unstructured server error
        instead of a validation failure.
        """
        if self.match == "regex":
            import re  # local import: this module's import block is not assumed to provide it

            try:
                re.compile(self.text)
            except re.error as exc:
                raise ValueError(f"text is not a valid regular expression: {exc}") from exc
        return self
|
||||||
|
|
||||||
|
|
||||||
|
class VerifyOCRTextNearPoint(BaseModel):
    """Verification spec: look for OCR text in a square around a point.

    The capture spans ``radius`` pixels on each side of ``(x, y)`` and the
    check passes when at least one OCR entry in it matches ``text``.
    """

    type: Literal["ocr_text_near_point"]
    # Query to look for; interpreted according to ``match``.
    text: str = Field(min_length=1, max_length=1000)
    # Centre of the inspected square.
    x: int = Field(ge=0)
    y: int = Field(ge=0)
    radius: int = Field(default=80, ge=1, le=1000)
    screen: int = 0
    match: Literal["contains", "exact", "regex"] = "contains"
    case_sensitive: bool = False
    # OCR entries below this confidence are ignored.
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)

    @model_validator(mode="after")
    def _validate_regex(self):
        """Reject an uncompilable pattern up front when ``match`` is ``"regex"``.

        Verification runs only after the interact action has been performed,
        so a bad pattern must fail during request validation rather than
        crash the polling loop once the action has already happened.
        """
        if self.match == "regex":
            import re  # local import: this module's import block is not assumed to provide it

            try:
                re.compile(self.text)
            except re.error as exc:
                raise ValueError(f"text is not a valid regular expression: {exc}") from exc
        return self
|
||||||
|
|
||||||
|
|
||||||
|
class InteractVerifyRequest(BaseModel):
    """Request body for ``POST /interact/verify``."""

    # The interact request whose action is executed before verification.
    action: InteractRequest
    # The check that is polled after the action.
    verify: VerifyOCRTextNearPoint
    # Pause between two verification attempts, in milliseconds.
    check_interval_ms: int = Field(default=250, ge=50, le=5000)
    # Overall verification budget, in milliseconds.
    timeout_ms: int = Field(default=3000, ge=100, le=60000)
|
||||||
|
|
||||||
|
|
||||||
|
ActionRequest.model_rebuild()
|
||||||
|
|||||||
@@ -11,7 +11,22 @@ from fastapi import HTTPException
|
|||||||
from PIL import ImageChops, ImageStat
|
from PIL import ImageChops, ImageStat
|
||||||
|
|
||||||
from .config import SETTINGS
|
from .config import SETTINGS
|
||||||
from .models import ActionRequest, GridTarget, LaunchRequest, PixelTarget, Target, WindowActionRequest, WindowQuery
|
from .models import (
|
||||||
|
ActionRequest,
|
||||||
|
ClickTextAction,
|
||||||
|
GridTarget,
|
||||||
|
InteractVerifyRequest,
|
||||||
|
LaunchRequest,
|
||||||
|
PixelTarget,
|
||||||
|
Target,
|
||||||
|
VerifyOCRTextNearPoint,
|
||||||
|
WindowActionRequest,
|
||||||
|
WindowQuery,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def api_error(status_code: int, code: str, message: str, details=None):
    """Raise an ``HTTPException`` whose detail is a structured error dict.

    The detail carries the keys ``code``, ``message`` and ``details``.
    This function never returns normally.
    """
    payload = {"code": code, "message": message, "details": details}
    raise HTTPException(status_code=status_code, detail=payload)
|
||||||
|
|
||||||
|
|
||||||
def import_capture_libs():
|
def import_capture_libs():
|
||||||
@@ -85,6 +100,50 @@ def capture_region_image(screen: int, region_x: int | None, region_y: int | None
|
|||||||
return crop, {"x": region_x, "y": region_y, "width": region_width, "height": region_height}, mon, displays, screen_selection
|
return crop, {"x": region_x, "y": region_y, "width": region_width, "height": region_height}, mon, displays, screen_selection
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ocr_items(image, origin_x: int, origin_y: int, min_confidence: float, lang: str, psm: int | None) -> list[dict]:
|
||||||
|
try:
|
||||||
|
import pytesseract
|
||||||
|
except Exception as exc:
|
||||||
|
api_error(503, "ocr_unavailable", f"pytesseract unavailable: {exc}")
|
||||||
|
|
||||||
|
config = ""
|
||||||
|
if psm is not None:
|
||||||
|
config = f"--psm {psm}"
|
||||||
|
try:
|
||||||
|
data = pytesseract.image_to_data(image, lang=lang, config=config, output_type=pytesseract.Output.DICT)
|
||||||
|
except Exception as exc:
|
||||||
|
api_error(503, "ocr_failed", f"ocr failed: {exc}")
|
||||||
|
|
||||||
|
out: list[dict] = []
|
||||||
|
n = len(data.get("text", []))
|
||||||
|
for i in range(n):
|
||||||
|
text = (data["text"][i] or "").strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
confidence = float(data["conf"][i])
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if confidence < min_confidence:
|
||||||
|
continue
|
||||||
|
left = int(data["left"][i])
|
||||||
|
top = int(data["top"][i])
|
||||||
|
width = int(data["width"][i])
|
||||||
|
height = int(data["height"][i])
|
||||||
|
bbox = {"x": origin_x + left, "y": origin_y + top, "width": width, "height": height}
|
||||||
|
center = {"x": bbox["x"] + (width // 2), "y": bbox["y"] + (height // 2)}
|
||||||
|
out.append(
|
||||||
|
{
|
||||||
|
"text": text,
|
||||||
|
"confidence": confidence,
|
||||||
|
"bbox": bbox,
|
||||||
|
"center": center,
|
||||||
|
"region_relative_bbox": {"x": left, "y": top, "width": width, "height": height},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
def serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
|
def serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
|
||||||
buf = io.BytesIO()
|
buf = io.BytesIO()
|
||||||
if image_format == "jpeg":
|
if image_format == "jpeg":
|
||||||
@@ -164,6 +223,39 @@ def enforce_allowed_region(x: int, y: int):
|
|||||||
raise HTTPException(status_code=403, detail="point outside allowed region")
|
raise HTTPException(status_code=403, detail="point outside allowed region")
|
||||||
|
|
||||||
|
|
||||||
|
def _text_matches(candidate: str, needle: str, mode: str, case_sensitive: bool) -> bool:
|
||||||
|
hay = candidate if case_sensitive else candidate.lower()
|
||||||
|
ndl = needle if case_sensitive else needle.lower()
|
||||||
|
if mode == "contains":
|
||||||
|
return ndl in hay
|
||||||
|
if mode == "exact":
|
||||||
|
return hay == ndl
|
||||||
|
flags = 0 if case_sensitive else re.IGNORECASE
|
||||||
|
return re.search(needle, candidate, flags=flags) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_text_match(click_text: ClickTextAction, items: list[dict]) -> dict:
    """Select the OCR entry that a ``click_text`` action should click.

    Entries are filtered with ``_text_matches``; ``click_text.occurrence``
    then decides which surviving entry is returned.

    Raises:
        HTTPException: via ``api_error`` — 404 when nothing matches, 409 when
            the requested ``nth`` exceeds the number of matches, and 409 when
            an ``exact`` query with ``occurrence="first"`` has several matches.
    """
    hits = []
    for entry in items:
        if _text_matches(entry["text"], click_text.text, click_text.match, click_text.case_sensitive):
            hits.append(entry)

    if not hits:
        # Offer the most confident OCR texts as hints for the caller.
        ranked = sorted(items, key=lambda v: v["confidence"], reverse=True)
        hints = [entry["text"] for entry in ranked[:8]]
        api_error(404, "ocr_text_not_found", "no OCR text matched", {"query": click_text.text, "candidates": hints})

    wanted = click_text.occurrence
    if wanted == "best":
        return max(hits, key=lambda item: item["confidence"])

    if wanted == "nth":
        position = (click_text.nth or 1) - 1
        if position >= len(hits):
            api_error(409, "ocr_nth_out_of_range", "requested nth match is out of range", {"match_count": len(hits), "nth": click_text.nth})
        return hits[position]

    # occurrence == "first": an exact query must not be ambiguous.
    if click_text.match == "exact" and len(hits) > 1:
        api_error(
            409,
            "ocr_text_ambiguous",
            "multiple OCR entries matched",
            {"match_count": len(hits), "candidates": [entry["text"] for entry in hits[:8]]},
        )
    return hits[0]
|
||||||
|
|
||||||
|
|
||||||
def import_input_lib():
|
def import_input_lib():
|
||||||
try:
|
try:
|
||||||
import pyautogui
|
import pyautogui
|
||||||
@@ -176,7 +268,10 @@ def import_input_lib():
|
|||||||
|
|
||||||
def exec_action(req: ActionRequest, screen: int = 0) -> dict:
|
def exec_action(req: ActionRequest, screen: int = 0) -> dict:
|
||||||
run_dry = SETTINGS["dry_run"] or req.dry_run
|
run_dry = SETTINGS["dry_run"] or req.dry_run
|
||||||
selected_display, _, screen_selection = select_display(screen)
|
action_screen = screen
|
||||||
|
if req.action == "click_text" and req.click_text and req.click_text.screen is not None:
|
||||||
|
action_screen = req.click_text.screen
|
||||||
|
selected_display, _, screen_selection = select_display(action_screen)
|
||||||
pyautogui = None if run_dry else import_input_lib()
|
pyautogui = None if run_dry else import_input_lib()
|
||||||
resolved_target = None
|
resolved_target = None
|
||||||
|
|
||||||
@@ -191,6 +286,36 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict:
|
|||||||
if req.action == "scroll" and resolved_target is None:
|
if req.action == "scroll" and resolved_target is None:
|
||||||
raise HTTPException(status_code=400, detail="target is required for scroll")
|
raise HTTPException(status_code=400, detail="target is required for scroll")
|
||||||
|
|
||||||
|
click_text_match = None
|
||||||
|
if req.action == "click_text":
|
||||||
|
if req.click_text is None:
|
||||||
|
api_error(400, "click_text_payload_required", "click_text payload is required")
|
||||||
|
region = req.click_text.region
|
||||||
|
img, captured_region, _, _, _ = capture_region_image(
|
||||||
|
action_screen,
|
||||||
|
None if region is None else region.x,
|
||||||
|
None if region is None else region.y,
|
||||||
|
None if region is None else region.width,
|
||||||
|
None if region is None else region.height,
|
||||||
|
)
|
||||||
|
items = extract_ocr_items(
|
||||||
|
img,
|
||||||
|
captured_region["x"],
|
||||||
|
captured_region["y"],
|
||||||
|
req.click_text.min_confidence,
|
||||||
|
req.click_text.ocr_lang,
|
||||||
|
req.click_text.ocr_psm,
|
||||||
|
)
|
||||||
|
matched = _resolve_text_match(req.click_text, items)
|
||||||
|
enforce_allowed_region(matched["center"]["x"], matched["center"]["y"])
|
||||||
|
click_text_match = {
|
||||||
|
"query": req.click_text.model_dump(),
|
||||||
|
"matched": matched,
|
||||||
|
"capture_region": captured_region,
|
||||||
|
"screen": screen_selection,
|
||||||
|
}
|
||||||
|
resolved_target = {"x": matched["center"]["x"], "y": matched["center"]["y"], "target_info": {"mode": "ocr_text"}}
|
||||||
|
|
||||||
if not run_dry:
|
if not run_dry:
|
||||||
if req.action == "move":
|
if req.action == "move":
|
||||||
pyautogui.moveTo(resolved_target["x"], resolved_target["y"], duration=duration_sec)
|
pyautogui.moveTo(resolved_target["x"], resolved_target["y"], duration=duration_sec)
|
||||||
@@ -211,8 +336,71 @@ def exec_action(req: ActionRequest, screen: int = 0) -> dict:
|
|||||||
if len(req.keys) < 1:
|
if len(req.keys) < 1:
|
||||||
raise HTTPException(status_code=400, detail="keys is required for hotkey")
|
raise HTTPException(status_code=400, detail="keys is required for hotkey")
|
||||||
pyautogui.hotkey(*req.keys)
|
pyautogui.hotkey(*req.keys)
|
||||||
|
elif req.action == "click_text":
|
||||||
|
pyautogui.click(
|
||||||
|
x=resolved_target["x"],
|
||||||
|
y=resolved_target["y"],
|
||||||
|
clicks=req.clicks,
|
||||||
|
interval=req.interval_ms / 1000.0,
|
||||||
|
button=req.button,
|
||||||
|
duration=duration_sec,
|
||||||
|
)
|
||||||
|
|
||||||
return {"action": req.action, "executed": not run_dry, "dry_run": run_dry, "screen": screen_selection, "display": selected_display, "resolved_target": resolved_target}
|
return {
|
||||||
|
"action": req.action,
|
||||||
|
"executed": not run_dry,
|
||||||
|
"dry_run": run_dry,
|
||||||
|
"screen": screen_selection,
|
||||||
|
"display": selected_display,
|
||||||
|
"resolved_target": resolved_target,
|
||||||
|
"click_text_match": click_text_match,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _verify_ocr_text_near_point(spec: VerifyOCRTextNearPoint) -> dict:
    """OCR a square around ``(spec.x, spec.y)`` and look for ``spec.text``.

    The square extends ``spec.radius`` in each direction and is clipped to
    the bounds of the selected display; its size never drops below 1x1.

    Returns:
        A dict with ``ok`` (whether any entry matched), ``matches`` (at most
        eight matching entries), ``items_count``, ``screen`` and ``region``.
    """
    display, _, _ = select_display(spec.screen)
    reach = spec.radius

    left = max(display["x"], spec.x - reach)
    top = max(display["y"], spec.y - reach)
    right = min(display["x"] + display["width"], spec.x + reach)
    bottom = min(display["y"] + display["height"], spec.y + reach)
    span_w = max(1, right - left)
    span_h = max(1, bottom - top)

    img, region, _, _, screen_selection = capture_region_image(spec.screen, left, top, span_w, span_h)
    items = extract_ocr_items(img, region["x"], region["y"], spec.min_confidence, spec.ocr_lang, spec.ocr_psm)

    matches = []
    for entry in items:
        if _text_matches(entry["text"], spec.text, spec.match, spec.case_sensitive):
            matches.append(entry)

    found = len(matches) > 0
    return {"ok": found, "matches": matches[:8], "items_count": len(items), "screen": screen_selection, "region": region}
|
||||||
|
|
||||||
|
|
||||||
|
def execute_and_verify(req: InteractVerifyRequest) -> dict:
    """Execute one interact action, then poll its verification check.

    The action runs exactly once. The check is then repeated every
    ``req.check_interval_ms`` until it succeeds or ``req.timeout_ms`` has
    elapsed since the start; at least one check is always performed.

    Returns:
        A dict with ``action_result``, ``verified``, ``attempts``,
        ``last_check`` and ``duration_ms``.
    """
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    started = time.monotonic()
    action_result = exec_action(req.action.action, req.action.screen)
    deadline = started + (req.timeout_ms / 1000.0)
    interval = req.check_interval_ms / 1000.0

    attempts = 0
    while True:
        attempts += 1
        check = _verify_ocr_text_near_point(req.verify)
        now = time.monotonic()
        if check["ok"] or now >= deadline:
            return {
                "action_result": action_result,
                "verified": bool(check["ok"]),
                "attempts": attempts,
                "last_check": check,
                "duration_ms": int((now - started) * 1000),
            }
        # Never sleep past the deadline, so the final check happens on time.
        time.sleep(min(interval, deadline - now))
|
||||||
|
|
||||||
|
|
||||||
def windows_only(feature: str):
|
def windows_only(feature: str):
|
||||||
|
|||||||
@@ -5,21 +5,24 @@ description: Use 3 methods to control a computer: see (screenshot+grid), interac
|
|||||||
|
|
||||||
# Clickthrough Computer Control
|
# Clickthrough Computer Control
|
||||||
|
|
||||||
Use exactly 3 methods:
|
Use these methods:
|
||||||
- `see`
|
- `see`
|
||||||
- `interact`
|
- `interact`
|
||||||
|
- `interact/verify`
|
||||||
- `exec`
|
- `exec`
|
||||||
|
|
||||||
## Method 1: See
|
## Method 1: See
|
||||||
|
|
||||||
Use `POST /see` to capture full screen or a region with a grid overlay.
|
Use `POST /see` to capture full screen or a region with a grid overlay.
|
||||||
Use `POST /see/zoom` to capture a tighter crop with a denser grid.
|
Use `POST /see/zoom` to capture a tighter crop with a denser grid.
|
||||||
|
Use `POST /see` with `ocr=true` when text localization is needed.
|
||||||
|
|
||||||
Rules:
|
Rules:
|
||||||
- Start with coarse grid (`12x12`).
|
- Start with coarse grid (`12x12`).
|
||||||
- For precision, zoom and use denser grid (`20x20` or higher).
|
- For precision, zoom and use denser grid (`20x20` or higher).
|
||||||
- Always use returned `meta.region` and `meta.grid` when computing click targets.
|
- Always use returned `meta.region` and `meta.grid` when computing click targets.
|
||||||
- Coordinates are global desktop coordinates.
|
- Coordinates are global desktop coordinates.
|
||||||
|
- OCR results are in `data.meta.ocr` and include confidence, bbox, and center.
|
||||||
|
|
||||||
## Method 2: Interact
|
## Method 2: Interact
|
||||||
|
|
||||||
@@ -27,15 +30,26 @@ Use `POST /interact` for one action at a time.
|
|||||||
|
|
||||||
Mouse actions:
|
Mouse actions:
|
||||||
- `move`, `click`, `right_click`, `double_click`, `middle_click`, `scroll`
|
- `move`, `click`, `right_click`, `double_click`, `middle_click`, `scroll`
|
||||||
|
- `click_text` (OCR-driven click; optionally scope with `click_text.region`)
|
||||||
|
|
||||||
Keyboard actions:
|
Keyboard actions:
|
||||||
- `type`, `hotkey`
|
- `type`, `hotkey`
|
||||||
|
|
||||||
Rules:
|
Rules:
|
||||||
- Prefer `grid` targets derived from fresh `see`/`see/zoom` captures.
|
- Prefer `grid` targets derived from fresh `see`/`see/zoom` captures.
|
||||||
|
- For text buttons/labels, prefer `click_text` and bound OCR with a region when possible.
|
||||||
- Use `pixel` only when you already have reliable coordinates.
|
- Use `pixel` only when you already have reliable coordinates.
|
||||||
- After each important action, call `see` again before continuing.
|
- After each important action, call `see` again before continuing.
|
||||||
|
|
||||||
|
## Method 2.5: Action Verify
|
||||||
|
|
||||||
|
Use `POST /interact/verify` to execute one action and immediately validate nearby OCR text state.
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Keep verification narrow: use `ocr_text_near_point` with a focused radius.
|
||||||
|
- Use short intervals/timeouts for speed (`check_interval_ms` ~250, `timeout_ms` 1000-3000).
|
||||||
|
- Prefer this over manual re-check loops when immediate confirmation is required.
|
||||||
|
|
||||||
## Method 3: Exec
|
## Method 3: Exec
|
||||||
|
|
||||||
Use `POST /exec` only for shell/system tasks.
|
Use `POST /exec` only for shell/system tasks.
|
||||||
@@ -49,8 +63,8 @@ Rules:
|
|||||||
|
|
||||||
1. `see` capture.
|
1. `see` capture.
|
||||||
2. If needed, `see/zoom` refine.
|
2. If needed, `see/zoom` refine.
|
||||||
3. `interact` one step.
|
3. `interact` one step (`click_text` for text UI targets).
|
||||||
4. `see` verify.
|
4. `interact/verify` for action->state confirmation, or `see` verify.
|
||||||
5. Repeat.
|
5. Repeat.
|
||||||
|
|
||||||
## Quick Safety Rules
|
## Quick Safety Rules
|
||||||
|
|||||||
139
tests/test_ocr_and_interact.py
Normal file
139
tests/test_ocr_and_interact.py
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from server import services
|
||||||
|
from server.app import app
|
||||||
|
from server.models import ClickTextAction
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_ocr_items_normalization(monkeypatch):
    """OCR rows are filtered by text/confidence and offset by the origin."""
    canned = {
        "text": ["hello", " ", "world"],
        "conf": ["95.0", "-1", "62.5"],
        "left": [10, 12, 40],
        "top": [20, 25, 60],
        "width": [30, 10, 50],
        "height": [10, 10, 12],
    }

    class FakeOutput:
        DICT = "DICT"

    class FakeTesseract:
        Output = FakeOutput

        @staticmethod
        def image_to_data(_image, lang, config, output_type):
            assert lang == "eng"
            assert output_type == "DICT"
            return canned

    # Stand in for the real module so the function-local import picks it up.
    monkeypatch.setitem(sys.modules, "pytesseract", FakeTesseract)

    blank = Image.new("RGB", (100, 100))
    items = services.extract_ocr_items(blank, origin_x=100, origin_y=200, min_confidence=60, lang="eng", psm=None)

    assert len(items) == 2
    first, second = items
    assert first["text"] == "hello"
    assert first["bbox"]["x"] == 110
    assert first["center"]["y"] == 225
    assert second["text"] == "world"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_text_match_contains_exact_regex_and_nth():
    """Exercise ``services._resolve_text_match`` across match/occurrence modes.

    Covers ``contains`` with ``first`` and ``best`` occurrences, a
    case-sensitive ``exact`` match, and a ``regex`` match selected via
    ``occurrence="nth"``.
    """
    candidates = [
        {"text": label, "confidence": score}
        for label, score in (("Save", 70), ("Save as", 96), ("SAVE", 88))
    ]

    def matched_text(**action_fields):
        # Build the action from keyword fields and return the chosen item's text.
        chosen = services._resolve_text_match(ClickTextAction(**action_fields), candidates)
        return chosen["text"]

    assert matched_text(text="save", match="contains", occurrence="first") == "Save"
    assert matched_text(text="save", match="contains", occurrence="best") == "Save as"
    assert matched_text(text="SAVE", match="exact", case_sensitive=True, occurrence="first") == "SAVE"
    assert matched_text(text="^Save", match="regex", occurrence="nth", nth=2) == "Save as"
|
||||||
|
|
||||||
|
|
||||||
|
def test_interact_click_text_region_optional(monkeypatch):
    """POST a dry-run ``click_text`` action to ``/interact`` with no region.

    Display selection, screen capture and OCR extraction on ``services`` are
    replaced with stubs, so the endpoint sees a single OCR item ("Apply").
    The response is expected to resolve the target to that item's center x
    and to echo the matched item.
    """

    def stub_select_display(screen):
        selection = {"requested": screen, "selected": screen, "fallback": False}
        return {"screen": screen}, [], selection

    def stub_capture_region_image(screen, x, y, w, h):
        # Fall back to a 20x20 region at the origin when no values are given.
        region = {"x": x or 0, "y": y or 0, "width": w or 20, "height": h or 20}
        return Image.new("RGB", (20, 20)), region, {}, [], {}

    def stub_extract_ocr_items(*args, **kwargs):
        apply_item = {
            "text": "Apply",
            "confidence": 93.0,
            "bbox": {"x": 10, "y": 20, "width": 20, "height": 10},
            "center": {"x": 20, "y": 25},
            "region_relative_bbox": {"x": 10, "y": 20, "width": 20, "height": 10},
        }
        return [apply_item]

    monkeypatch.setattr(services, "select_display", stub_select_display)
    monkeypatch.setattr(services, "capture_region_image", stub_capture_region_image)
    monkeypatch.setattr(services, "extract_ocr_items", stub_extract_ocr_items)

    request_body = {
        "screen": 0,
        "action": {
            "action": "click_text",
            "dry_run": True,
            "click_text": {"text": "Apply", "match": "contains"},
        },
    }
    response = TestClient(app).post("/interact", json=request_body)

    assert response.status_code == 200
    data = response.json()["data"]
    assert data["resolved_target"]["x"] == 20
    assert data["click_text_match"]["matched"]["text"] == "Apply"
|
||||||
|
|
||||||
|
|
||||||
|
def test_see_ocr_off_on_contract(monkeypatch):
    """Verify that ``/see`` includes ``meta["ocr"]`` only when ``ocr`` is true.

    Capture, image encoding and OCR extraction are stubbed on ``server.app``.
    With ``ocr: False`` the response meta must not contain an ``ocr`` key;
    with ``ocr: True`` it must carry the stubbed OCR items.
    """

    def stub_capture(*args, **kwargs):
        region = {"x": 0, "y": 0, "width": 10, "height": 10}
        return Image.new("RGB", (10, 10)), region, {"screen": 0}, [], {}

    def stub_encode(*args, **kwargs):
        return "abc"

    def stub_ocr(*args, **kwargs):
        return [{"text": "x"}]

    monkeypatch.setattr("server.app.capture_region_image", stub_capture)
    monkeypatch.setattr("server.app.encode_image", stub_encode)
    monkeypatch.setattr("server.app.extract_ocr_items", stub_ocr)

    client = TestClient(app)

    without_ocr = client.post("/see", json={"ocr": False, "with_grid": False})
    assert without_ocr.status_code == 200
    assert "ocr" not in without_ocr.json()["data"]["meta"]

    with_ocr = client.post("/see", json={"ocr": True, "with_grid": False})
    assert with_ocr.status_code == 200
    assert with_ocr.json()["data"]["meta"]["ocr"][0]["text"] == "x"
|
||||||
|
|
||||||
|
|
||||||
|
def test_interact_verify_success_and_timeout(monkeypatch):
    """Cover both outcomes of ``POST /interact/verify``.

    First the verify stub reports failure once and success on its second
    call; the response must be ``verified`` with exactly two attempts. Then
    the stub is swapped for one that never succeeds; the response must come
    back (still HTTP 200) as not verified after at least one attempt.
    """

    def verify_result(ok, items_count):
        # Shape returned by the stubbed verifier; only ``ok``/``items_count`` vary.
        return {
            "ok": ok,
            "matches": [],
            "items_count": items_count,
            "screen": {"selected": 0},
            "region": {"x": 0, "y": 0, "width": 1, "height": 1},
        }

    def stub_exec_action(action, screen=0):
        return {"action": action.action, "executed": True}

    verify_calls = 0

    def succeeds_on_second_call(_spec):
        nonlocal verify_calls
        verify_calls += 1
        return verify_result(verify_calls >= 2, 1)

    def never_succeeds(_spec):
        return verify_result(False, 0)

    monkeypatch.setattr(services, "exec_action", stub_exec_action)
    monkeypatch.setattr(services, "_verify_ocr_text_near_point", succeeds_on_second_call)

    client = TestClient(app)
    payload = {
        "action": {"screen": 0, "action": {"action": "type", "text": "hello"}},
        "verify": {"type": "ocr_text_near_point", "text": "hello", "x": 100, "y": 100},
        "check_interval_ms": 10,
        "timeout_ms": 500,
    }

    success_response = client.post("/interact/verify", json=payload)
    assert success_response.status_code == 200
    success_data = success_response.json()["data"]
    assert success_data["verified"] is True
    assert success_data["attempts"] == 2

    monkeypatch.setattr(services, "_verify_ocr_text_near_point", never_succeeds)

    timeout_response = client.post("/interact/verify", json=payload)
    assert timeout_response.status_code == 200
    timeout_data = timeout_response.json()["data"]
    assert timeout_data["verified"] is False
    assert timeout_data["attempts"] >= 1
|
||||||
Reference in New Issue
Block a user