feat(ocr): add higher-level text search helpers
All checks were successful
python-syntax / syntax-check (push) Successful in 6s
All checks were successful
python-syntax / syntax-check (push) Successful in 6s
This commit is contained in:
@@ -11,7 +11,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
|
|||||||
- **Window lifecycle endpoints**: list/focus/restore/minimize/maximize/close windows via `GET /windows` + `POST /windows/action`
|
- **Window lifecycle endpoints**: list/focus/restore/minimize/maximize/close windows via `GET /windows` + `POST /windows/action`
|
||||||
- **Structured launch endpoint**: start an app/process without dropping to a shell via `POST /launch`
|
- **Structured launch endpoint**: start an app/process without dropping to a shell via `POST /launch`
|
||||||
- **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait`
|
- **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait`
|
||||||
- **OCR endpoint**: extract text blocks with bounding boxes via `POST /ocr`
|
- **OCR endpoints**: extract text blocks or search for matching text via `POST /ocr` and `POST /ocr/find`
|
||||||
- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
|
- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
|
||||||
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
|
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
|
||||||
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
|
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
|
||||||
@@ -39,7 +39,7 @@ For OCR support, install the native `tesseract` binary on the host (in addition
|
|||||||
3. Decide cell / target
|
3. Decide cell / target
|
||||||
4. Optional `POST /zoom?screen=0` for finer targeting
|
4. Optional `POST /zoom?screen=0` for finer targeting
|
||||||
5. `POST /action?screen=0` to execute
|
5. `POST /action?screen=0` to execute
|
||||||
6. `GET /screen?screen=0` again to verify result
|
6. `GET /screen?screen=0` again to verify result, or use `POST /ocr/find` when you need explicit text matching
|
||||||
|
|
||||||
Important:
|
Important:
|
||||||
- `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields.
|
- `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields.
|
||||||
|
|||||||
35
docs/API.md
35
docs/API.md
@@ -432,6 +432,41 @@ Notes:
|
|||||||
- Requires `tesseract` executable plus Python package `pytesseract`.
|
- Requires `tesseract` executable plus Python package `pytesseract`.
|
||||||
- If `tesseract` is not on `PATH`, set `CLICKTHROUGH_TESSERACT_CMD` to the full executable path.
|
- If `tesseract` is not on `PATH`, set `CLICKTHROUGH_TESSERACT_CMD` to the full executable path.
|
||||||
|
|
||||||
|
## `POST /ocr/find`
|
||||||
|
|
||||||
|
Search OCR output for matching text instead of post-processing raw OCR blocks client-side.
|
||||||
|
|
||||||
|
Query params:
|
||||||
|
|
||||||
|
- `screen` (int, default `0`) - used for `mode=screen` and `mode=region`
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mode": "screen",
|
||||||
|
"query": "Settings",
|
||||||
|
"match": "contains",
|
||||||
|
"group_lines": true,
|
||||||
|
"max_results": 10,
|
||||||
|
"language_hint": "eng",
|
||||||
|
"min_confidence": 0.4
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Modes:
|
||||||
|
- `screen`
|
||||||
|
- `region`
|
||||||
|
- `image`
|
||||||
|
|
||||||
|
Options:
|
||||||
|
- `match`: matching strategy — `contains` (default), `exact`, or `regex`
|
||||||
|
- `group_lines=true`: combine nearby OCR words into line-level candidates before matching
|
||||||
|
- `max_results`: maximum number of matches returned after sorting by confidence (integer `1`–`200`, default `20`)
|
||||||
|
|
||||||
|
Response includes:
|
||||||
|
- `matches` — confidence-sorted candidate matches
|
||||||
|
- `match_count`
|
||||||
|
- `blocks_considered`
|
||||||
|
|
||||||
## `POST /exec`
|
## `POST /exec`
|
||||||
|
|
||||||
Execute a shell command on the host running Clickthrough.
|
Execute a shell command on the host running Clickthrough.
|
||||||
|
|||||||
183
server/app.py
183
server/app.py
@@ -239,6 +239,12 @@ class WaitRequest(BaseModel):
|
|||||||
poll_interval_ms: int = Field(default=250, ge=50, le=10000)
|
poll_interval_ms: int = Field(default=250, ge=50, le=10000)
|
||||||
|
|
||||||
|
|
||||||
|
class OCRFindRequest(OCRRequest):
    """Request body for ``POST /ocr/find``.

    Extends ``OCRRequest`` with the options that control how OCR output is
    searched for a piece of text.
    """

    # Text to search for; required, between 1 and 512 characters.
    query: str = Field(..., min_length=1, max_length=512)
    # Comparison strategy applied between the query and each candidate text.
    match: Literal["contains", "exact", "regex"] = Field(default="contains")
    # When true, OCR blocks are merged into line-level candidates before matching.
    group_lines: bool = Field(default=True)
    # Cap on the number of matches returned (1-200).
    max_results: int = Field(default=20, ge=1, le=200)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
|
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
|
||||||
@@ -445,7 +451,11 @@ def _import_ocr_libs():
|
|||||||
|
|
||||||
|
|
||||||
def _decode_image_base64(value: str):
|
def _decode_image_base64(value: str):
|
||||||
Image, _, _ = _import_capture_libs()
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=500, detail=f"image decode backend unavailable: {exc}") from exc
|
||||||
|
|
||||||
payload = value.strip()
|
payload = value.strip()
|
||||||
if payload.startswith("data:"):
|
if payload.startswith("data:"):
|
||||||
parts = payload.split(",", 1)
|
parts = payload.split(",", 1)
|
||||||
@@ -773,6 +783,36 @@ def _capture_region_image(screen: int, region_x: int | None, region_y: int | Non
|
|||||||
return crop, region, mon, displays, screen_selection
|
return crop, region, mon, displays, screen_selection
|
||||||
|
|
||||||
|
|
||||||
|
def _capture_ocr_source(req: OCRRequest, screen: int = 0):
    """Resolve the image an OCR request should run against.

    Args:
        req: OCR request. ``req.mode`` selects the source: ``"image"`` decodes
            ``req.image_base64``, ``"screen"`` uses the whole captured monitor,
            and any other value is treated as a region crop of that monitor.
        screen: Index handed to ``_capture_screen`` for screen/region modes.

    Returns:
        A 6-tuple ``(image, region, mon, displays, screen_selection, source)``.
        In ``"image"`` mode ``mon``, ``displays`` and ``screen_selection`` are
        ``None`` and ``region`` spans the decoded image starting at ``(0, 0)``.

    Raises:
        HTTPException: 400 if a region crop is requested without all four
            ``region_*`` fields, or if the region falls outside the captured
            monitor.
    """
    source = req.mode
    if source == "image":
        # No screen capture is performed for caller-supplied images.
        image = _decode_image_base64(req.image_base64 or "")
        region = {"x": 0, "y": 0, "width": image.size[0], "height": image.size[1]}
        return image, region, None, None, None, source

    base_img, mon, displays, screen_selection = _capture_screen(screen)
    if source == "screen":
        region = {"x": mon["x"], "y": mon["y"], "width": mon["width"], "height": mon["height"]}
        return base_img, region, mon, displays, screen_selection, source

    region = {
        "x": req.region_x,
        "y": req.region_y,
        "width": req.region_width,
        "height": req.region_height,
    }
    # Fail with a client error instead of a TypeError (HTTP 500) from the
    # arithmetic below when a region field was not supplied.
    if any(value is None for value in region.values()):
        raise HTTPException(
            status_code=400,
            detail="region mode requires region_x, region_y, region_width and region_height",
        )

    # Region fields and the monitor origin share one coordinate space here;
    # subtracting the origin yields crop coordinates inside the captured image.
    left = region["x"] - mon["x"]
    top = region["y"] - mon["y"]
    right = left + region["width"]
    bottom = top + region["height"]
    if left < 0 or top < 0 or right > base_img.size[0] or bottom > base_img.size[1]:
        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

    image = base_img.crop((left, top, right, bottom))
    return image, region, mon, displays, screen_selection, source
|
||||||
|
|
||||||
|
|
||||||
def _image_diff_ratio(before, after) -> float:
|
def _image_diff_ratio(before, after) -> float:
|
||||||
diff = ImageChops.difference(before, after)
|
diff = ImageChops.difference(before, after)
|
||||||
stat = ImageStat.Stat(diff)
|
stat = ImageStat.Stat(diff)
|
||||||
@@ -780,6 +820,84 @@ def _image_diff_ratio(before, after) -> float:
|
|||||||
return float(sum(channel_means) / (len(channel_means) * 255.0))
|
return float(sum(channel_means) / (len(channel_means) * 255.0))
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_bbox(blocks: list[dict]) -> dict:
|
||||||
|
xs = [b["bbox"]["x"] for b in blocks]
|
||||||
|
ys = [b["bbox"]["y"] for b in blocks]
|
||||||
|
rights = [b["bbox"]["x"] + b["bbox"]["width"] for b in blocks]
|
||||||
|
bottoms = [b["bbox"]["y"] + b["bbox"]["height"] for b in blocks]
|
||||||
|
return {
|
||||||
|
"x": min(xs),
|
||||||
|
"y": min(ys),
|
||||||
|
"width": max(rights) - min(xs),
|
||||||
|
"height": max(bottoms) - min(ys),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _group_ocr_lines(blocks: list[dict]) -> list[dict]:
|
||||||
|
if not blocks:
|
||||||
|
return []
|
||||||
|
|
||||||
|
sorted_blocks = sorted(blocks, key=lambda b: (b["bbox"]["y"], b["bbox"]["x"]))
|
||||||
|
lines: list[list[dict]] = []
|
||||||
|
current: list[dict] = []
|
||||||
|
current_center = None
|
||||||
|
|
||||||
|
for block in sorted_blocks:
|
||||||
|
bbox = block["bbox"]
|
||||||
|
center_y = bbox["y"] + (bbox["height"] / 2)
|
||||||
|
tolerance = max(10.0, bbox["height"] * 0.8)
|
||||||
|
if current and current_center is not None and abs(center_y - current_center) > tolerance:
|
||||||
|
lines.append(sorted(current, key=lambda item: item["bbox"]["x"]))
|
||||||
|
current = []
|
||||||
|
current_center = None
|
||||||
|
current.append(block)
|
||||||
|
current_center = sum(item["bbox"]["y"] + (item["bbox"]["height"] / 2) for item in current) / len(current)
|
||||||
|
|
||||||
|
if current:
|
||||||
|
lines.append(sorted(current, key=lambda item: item["bbox"]["x"]))
|
||||||
|
|
||||||
|
grouped = []
|
||||||
|
for idx, line_blocks in enumerate(lines):
|
||||||
|
text = " ".join(item["text"] for item in line_blocks).strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
grouped.append(
|
||||||
|
{
|
||||||
|
"text": text,
|
||||||
|
"confidence": round(sum(item["confidence"] for item in line_blocks) / len(line_blocks), 4),
|
||||||
|
"bbox": _merge_bbox(line_blocks),
|
||||||
|
"blocks": line_blocks,
|
||||||
|
"line_index": idx,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return grouped
|
||||||
|
|
||||||
|
|
||||||
|
def _find_text_matches(blocks: list[dict], query: str, match_mode: str, group_lines: bool, max_results: int) -> list[dict]:
    """Search OCR output for candidates whose text matches ``query``.

    Args:
        blocks: OCR blocks with ``"text"``, ``"confidence"`` and ``"bbox"``.
        query: Text to look for; normalized with ``_normalize_text`` first.
        match_mode: Strategy forwarded to ``_matches_text``.
        group_lines: Match against line-level candidates from
            ``_group_ocr_lines`` instead of the raw blocks.
        max_results: Maximum number of matches to return.

    Returns:
        Matches ordered by descending confidence, then by ``bbox`` ``y`` and
        ``x``, truncated to ``max_results``. Grouped matches additionally carry
        ``blocks`` and ``line_index``.
    """
    # NOTE(review): the query is normalized even when match_mode is "regex";
    # confirm _normalize_text cannot change the meaning of a pattern.
    needle = _normalize_text(query)
    if group_lines:
        pool = _group_ocr_lines(blocks)
    else:
        pool = blocks

    hits = []
    for candidate in pool:
        haystack = _normalize_text(candidate["text"])
        # Skip candidates that normalize to nothing, and those that do not match.
        if not haystack or not _matches_text(haystack, needle, match_mode):
            continue
        hit = {
            "text": candidate["text"],
            "normalized_text": haystack,
            "confidence": candidate["confidence"],
            "bbox": candidate["bbox"],
            "grouped": group_lines,
        }
        if group_lines:
            hit.update(blocks=candidate["blocks"], line_index=candidate["line_index"])
        hits.append(hit)

    def _rank(hit: dict) -> tuple:
        # Highest confidence first; ties resolved in reading order.
        return (-hit["confidence"], hit["bbox"]["y"], hit["bbox"]["x"])

    return sorted(hits, key=_rank)[:max_results]
|
||||||
|
|
||||||
|
|
||||||
def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
|
def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
|
||||||
condition = req.condition
|
condition = req.condition
|
||||||
deadline = time.time() + (req.timeout_ms / 1000.0)
|
deadline = time.time() + (req.timeout_ms / 1000.0)
|
||||||
@@ -1317,37 +1435,9 @@ def wait(req: WaitRequest, screen: int = 0, _: None = Depends(_auth)):
|
|||||||
|
|
||||||
@app.post("/ocr")
|
@app.post("/ocr")
|
||||||
def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
|
def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
|
||||||
source = req.mode
|
image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
|
||||||
if source == "image":
|
offset_x = region["x"] if source != "image" else 0
|
||||||
image = _decode_image_base64(req.image_base64 or "")
|
offset_y = region["y"] if source != "image" else 0
|
||||||
region = {"x": 0, "y": 0, "width": image.size[0], "height": image.size[1]}
|
|
||||||
blocks = _run_ocr(image, req.language_hint, req.min_confidence, 0, 0)
|
|
||||||
else:
|
|
||||||
base_img, mon, displays, screen_selection = _capture_screen(screen)
|
|
||||||
if source == "screen":
|
|
||||||
image = base_img
|
|
||||||
region = {"x": mon["x"], "y": mon["y"], "width": mon["width"], "height": mon["height"]}
|
|
||||||
offset_x = mon["x"]
|
|
||||||
offset_y = mon["y"]
|
|
||||||
else:
|
|
||||||
left = req.region_x - mon["x"]
|
|
||||||
top = req.region_y - mon["y"]
|
|
||||||
right = left + req.region_width
|
|
||||||
bottom = top + req.region_height
|
|
||||||
|
|
||||||
if left < 0 or top < 0 or right > base_img.size[0] or bottom > base_img.size[1]:
|
|
||||||
raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")
|
|
||||||
|
|
||||||
image = base_img.crop((left, top, right, bottom))
|
|
||||||
region = {
|
|
||||||
"x": req.region_x,
|
|
||||||
"y": req.region_y,
|
|
||||||
"width": req.region_width,
|
|
||||||
"height": req.region_height,
|
|
||||||
}
|
|
||||||
offset_x = req.region_x
|
|
||||||
offset_y = req.region_y
|
|
||||||
|
|
||||||
blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
|
blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -1366,6 +1456,35 @@ def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ocr/find")
def ocr_find(req: OCRFindRequest, screen: int = 0, _: None = Depends(_auth)):
    """Run OCR on the requested source and return the text matches for ``req.query``.

    The source image comes from ``_capture_ocr_source``; OCR blocks from
    ``_run_ocr`` are then filtered and ranked by ``_find_text_matches``.
    """
    image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)

    # Caller-supplied images use a zero offset; captures pass the region origin
    # to _run_ocr (presumably so boxes come back in screen coordinates —
    # confirm against _run_ocr).
    from_capture = source != "image"
    if from_capture:
        offset_x, offset_y = region["x"], region["y"]
    else:
        offset_x = offset_y = 0

    blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
    matches = _find_text_matches(blocks, req.query, req.match, req.group_lines, req.max_results)

    envelope = {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
    }
    envelope["result"] = {
        "mode": source,
        "screen": screen_selection if from_capture else None,
        "display": mon if from_capture else None,
        "language_hint": req.language_hint,
        "min_confidence": req.min_confidence,
        "query": req.query,
        "match": req.match,
        "group_lines": req.group_lines,
        "region": region,
        "matches": matches,
        "match_count": len(matches),
        "blocks_considered": len(blocks),
    }
    return envelope
|
||||||
|
|
||||||
|
|
||||||
@app.post("/batch")
|
@app.post("/batch")
|
||||||
def batch(req: BatchRequest, screen: int = 0, _: None = Depends(_auth)):
|
def batch(req: BatchRequest, screen: int = 0, _: None = Depends(_auth)):
|
||||||
results = []
|
results = []
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ Say what you actually have: screenshots, OCR output, and fresh verification capt
|
|||||||
- `POST /launch` → start an app/process without dropping to a shell
|
- `POST /launch` → start an app/process without dropping to a shell
|
||||||
- `POST /wait?screen=0` → wait for text, window, or visual state changes
|
- `POST /wait?screen=0` → wait for text, window, or visual state changes
|
||||||
- `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided image bytes
|
- `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided image bytes
|
||||||
|
- `POST /ocr/find?screen=0` → search OCR output for matching text candidates
|
||||||
- `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)
|
- `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)
|
||||||
- `POST /batch?screen=0` → sequential action list
|
- `POST /batch?screen=0` → sequential action list
|
||||||
- `POST /exec` → PowerShell/Bash/CMD command execution (requires configured exec secret + header)
|
- `POST /exec` → PowerShell/Bash/CMD command execution (requires configured exec secret + header)
|
||||||
|
|||||||
Reference in New Issue
Block a user