feat(ocr): add higher-level text search helpers

2026-05-01 16:23:16 +02:00
parent 8857feaf7b
commit f00c525721
4 changed files with 190 additions and 35 deletions
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
 - **Window lifecycle endpoints**: list/focus/restore/minimize/maximize/close windows via `GET /windows` + `POST /windows/action`
 - **Structured launch endpoint**: start an app/process without dropping to a shell via `POST /launch`
 - **Wait/sync endpoint**: poll for text, window, or visual state changes via `POST /wait`
- **OCR endpoint**: extract text blocks with bounding boxes via `POST /ocr`
+- **OCR endpoints**: extract text blocks or search for matching text via `POST /ocr` and `POST /ocr/find`
 - **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
 - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
 - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
@@ -39,7 +39,7 @@ For OCR support, install the native `tesseract` binary on the host (in addition
 3. Decide cell / target
 4. Optional `POST /zoom?screen=0` for finer targeting
 5. `POST /action?screen=0` to execute
-6. `GET /screen?screen=0` again to verify result
+6. `GET /screen?screen=0` again to verify result, or use `POST /ocr/find` when you need explicit text matching

 Important:
 - `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields.
--- a/docs/API.md
+++ b/docs/API.md
@@ -432,6 +432,41 @@ Notes:
 - Requires `tesseract` executable plus Python package `pytesseract`.
 - If `tesseract` is not on `PATH`, set `CLICKTHROUGH_TESSERACT_CMD` to the full executable path.

+## `POST /ocr/find`
+
+Search OCR output for matching text instead of post-processing raw OCR blocks client-side.
+
+Query params:
+
+- `screen` (int, default `0`) - used for `mode=screen` and `mode=region`
+
+```json
+{
+  "mode": "screen",
+  "query": "Settings",
+  "match": "contains",
+  "group_lines": true,
+  "max_results": 10,
+  "language_hint": "eng",
+  "min_confidence": 0.4
+}
+```
+
+Modes:
+- `screen`
+- `region`
+- `image`
+
+Options:
+- `match`: `contains` (default), `exact`, or `regex`
+- `group_lines=true` (default): combine nearby OCR words into line-level candidates before matching
+- `max_results`: maximum number of matches returned, applied after confidence sorting (default `20`, range `1`–`200`)
+
+Response includes:
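+- `matches` — candidate matches sorted by descending confidence, capped at `max_results`
+- `match_count` — number of entries in `matches`
+- `blocks_considered` — number of OCR blocks produced before grouping and matching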
+
 ## `POST /exec`

 Execute a shell command on the host running Clickthrough.
--- a/server/app.py
+++ b/server/app.py
@@ -239,6 +239,12 @@ class WaitRequest(BaseModel):
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)


+class OCRFindRequest(OCRRequest):  # Request body for POST /ocr/find: the OCRRequest fields plus text-search options.
+    query: str = Field(min_length=1, max_length=512)  # text to search for; must be 1-512 characters
+    match: Literal["contains", "exact", "regex"] = "contains"  # matching strategy handed to _find_text_matches
+    group_lines: bool = True  # when true, OCR blocks are merged into line-level candidates before matching
+    max_results: int = Field(default=20, ge=1, le=200)  # cap on the number of returned matches (1-200)
+


 def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
@@ -445,7 +451,11 @@ def _import_ocr_libs():


 def _decode_image_base64(value: str):
-    Image, _, _ = _import_capture_libs()
+    try:
+        from PIL import Image
+    except Exception as exc:
+        raise HTTPException(status_code=500, detail=f"image decode backend unavailable: {exc}") from exc
+
    payload = value.strip()
    if payload.startswith("data:"):
        parts = payload.split(",", 1)
@@ -773,6 +783,36 @@ def _capture_region_image(screen: int, region_x: int | None, region_y: int | Non
    return crop, region, mon, displays, screen_selection


+def _capture_ocr_source(req: OCRRequest, screen: int = 0):  # Resolve the OCR input for req.mode; always returns (image, region, mon, displays, screen_selection, source).
+    source = req.mode
+    if source == "image":  # caller-supplied image: nothing is captured, so the three display values are None
+        image = _decode_image_base64(req.image_base64 or "")  # a missing payload is forwarded as ""
+        region = {"x": 0, "y": 0, "width": image.size[0], "height": image.size[1]}  # whole decoded image, origin (0, 0)
+        return image, region, None, None, None, source
+
+    base_img, mon, displays, screen_selection = _capture_screen(screen)  # every non-image mode starts from a capture of the selected screen
+    if source == "screen":
+        image = base_img
+        region = {"x": mon["x"], "y": mon["y"], "width": mon["width"], "height": mon["height"]}  # region is the monitor's own bounds
+        return image, region, mon, displays, screen_selection, source
+
+    left = req.region_x - mon["x"]  # crop box relative to base_img; NOTE(review): assumes the region_* fields are not None here (presumably validated by OCRRequest) - confirm
+    top = req.region_y - mon["y"]
+    right = left + req.region_width
+    bottom = top + req.region_height
+    if left < 0 or top < 0 or right > base_img.size[0] or bottom > base_img.size[1]:  # reject a crop box that extends past the captured image
+        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")
+
+    image = base_img.crop((left, top, right, bottom))
+    region = {  # reported with the request's own coordinates, not the capture-relative crop box
+        "x": req.region_x,
+        "y": req.region_y,
+        "width": req.region_width,
+        "height": req.region_height,
+    }
+    return image, region, mon, displays, screen_selection, source
+
+
 def _image_diff_ratio(before, after) -> float:
    diff = ImageChops.difference(before, after)
    stat = ImageStat.Stat(diff)
@@ -780,6 +820,84 @@ def _image_diff_ratio(before, after) -> float:
    return float(sum(channel_means) / (len(channel_means) * 255.0))


+def _merge_bbox(blocks: list[dict]) -> dict:  # Smallest x/y/width/height box enclosing every block's "bbox"; min()/max() raise ValueError if blocks is empty.
+    xs = [b["bbox"]["x"] for b in blocks]  # left edges
+    ys = [b["bbox"]["y"] for b in blocks]  # top edges
+    rights = [b["bbox"]["x"] + b["bbox"]["width"] for b in blocks]  # right edges (x + width)
+    bottoms = [b["bbox"]["y"] + b["bbox"]["height"] for b in blocks]  # bottom edges (y + height)
+    return {
+        "x": min(xs),
+        "y": min(ys),
+        "width": max(rights) - min(xs),  # span from the leftmost left edge to the rightmost right edge
+        "height": max(bottoms) - min(ys),  # span from the topmost top edge to the lowest bottom edge
+    }
+
+
+def _group_ocr_lines(blocks: list[dict]) -> list[dict]:  # Cluster OCR blocks into lines by vertical centre; returns one merged candidate dict per non-blank line.
+    if not blocks:
+        return []
+
+    sorted_blocks = sorted(blocks, key=lambda b: (b["bbox"]["y"], b["bbox"]["x"]))  # order by bbox y, then bbox x
+    lines: list[list[dict]] = []
+    current: list[dict] = []  # blocks collected for the line currently being built
+    current_center = None  # mean vertical centre of the blocks in `current`
+
+    for block in sorted_blocks:
+        bbox = block["bbox"]
+        center_y = bbox["y"] + (bbox["height"] / 2)
+        tolerance = max(10.0, bbox["height"] * 0.8)  # allowed centre drift: 80% of this block's height, but never below 10
+        if current and current_center is not None and abs(center_y - current_center) > tolerance:  # too far from the running centre: close the current line
+            lines.append(sorted(current, key=lambda item: item["bbox"]["x"]))  # within a line, order blocks by x
+            current = []
+            current_center = None
+        current.append(block)
+        current_center = sum(item["bbox"]["y"] + (item["bbox"]["height"] / 2) for item in current) / len(current)  # recompute the running mean centre
+
+    if current:  # flush the last open line
+        lines.append(sorted(current, key=lambda item: item["bbox"]["x"]))
+
+    grouped = []
+    for idx, line_blocks in enumerate(lines):
+        text = " ".join(item["text"] for item in line_blocks).strip()  # block texts joined with single spaces
+        if not text:  # drop lines whose joined text is blank
+            continue
+        grouped.append(
+            {
+                "text": text,
+                "confidence": round(sum(item["confidence"] for item in line_blocks) / len(line_blocks), 4),  # mean block confidence, 4 decimals
+                "bbox": _merge_bbox(line_blocks),
+                "blocks": line_blocks,  # the member blocks, already sorted by x
+                "line_index": idx,  # position among all clustered lines; blank lines skipped above leave gaps
+            }
+        )
+    return grouped
+
+
+def _find_text_matches(blocks: list[dict], query: str, match_mode: str, group_lines: bool, max_results: int) -> list[dict]:  # Return up to max_results candidates whose normalized text matches query, highest confidence first.
+    target = _normalize_text(query)  # NOTE(review): the query is normalized even when match_mode is "regex" - confirm _normalize_text cannot alter pattern syntax
+    candidates = _group_ocr_lines(blocks) if group_lines else blocks  # match against merged lines or against the raw blocks
+    matches = []
+    for item in candidates:
+        normalized = _normalize_text(item["text"])
+        if not normalized:  # skip candidates whose text normalizes to an empty value
+            continue
+        if _matches_text(normalized, target, match_mode):  # comparison is done on the normalized forms
+            match = {
+                "text": item["text"],  # original, un-normalized text
+                "normalized_text": normalized,
+                "confidence": item["confidence"],
+                "bbox": item["bbox"],
+                "grouped": group_lines,
+            }
+            if group_lines:  # line-level matches also expose their member blocks and line index
+                match["blocks"] = item["blocks"]
+                match["line_index"] = item["line_index"]
+            matches.append(match)
+
+    matches.sort(key=lambda item: (-item["confidence"], item["bbox"]["y"], item["bbox"]["x"]))  # descending confidence, ties by bbox y then x
+    return matches[:max_results]  # the cap is applied after sorting
+
+
 def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
    condition = req.condition
    deadline = time.time() + (req.timeout_ms / 1000.0)
@@ -1317,38 +1435,10 @@ def wait(req: WaitRequest, screen: int = 0, _: None = Depends(_auth)):

@app.post("/ocr")
 def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
-    source = req.mode
-    if source == "image":
-        image = _decode_image_base64(req.image_base64 or "")
-        region = {"x": 0, "y": 0, "width": image.size[0], "height": image.size[1]}
-        blocks = _run_ocr(image, req.language_hint, req.min_confidence, 0, 0)
-    else:
-        base_img, mon, displays, screen_selection = _capture_screen(screen)
-        if source == "screen":
-            image = base_img
-            region = {"x": mon["x"], "y": mon["y"], "width": mon["width"], "height": mon["height"]}
-            offset_x = mon["x"]
-            offset_y = mon["y"]
-        else:
-            left = req.region_x - mon["x"]
-            top = req.region_y - mon["y"]
-            right = left + req.region_width
-            bottom = top + req.region_height
-
-            if left < 0 or top < 0 or right > base_img.size[0] or bottom > base_img.size[1]:
-                raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")
-
-            image = base_img.crop((left, top, right, bottom))
-            region = {
-                "x": req.region_x,
-                "y": req.region_y,
-                "width": req.region_width,
-                "height": req.region_height,
-            }
-            offset_x = req.region_x
-            offset_y = req.region_y
-
-        blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
+    image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
+    offset_x = region["x"] if source != "image" else 0
+    offset_y = region["y"] if source != "image" else 0
+    blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)

    return {
        "ok": True,
@@ -1366,6 +1456,35 @@ def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
    }


+@app.post("/ocr/find")
+def ocr_find(req: OCRFindRequest, screen: int = 0, _: None = Depends(_auth)):  # Run OCR on the requested source and return only the candidates matching req.query.
+    image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
+    offset_x = region["x"] if source != "image" else 0  # region origin is passed to _run_ocr as the offset; 0 for caller-supplied images
+    offset_y = region["y"] if source != "image" else 0
+    blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
+    matches = _find_text_matches(blocks, req.query, req.match, req.group_lines, req.max_results)
+
+    return {
+        "ok": True,
+        "request_id": _request_id(),
+        "time_ms": _now_ms(),
+        "result": {
+            "mode": source,
+            "screen": screen_selection if source != "image" else None,  # no screen info when the image was supplied by the caller
+            "display": mon if source != "image" else None,
+            "language_hint": req.language_hint,
+            "min_confidence": req.min_confidence,
+            "query": req.query,
+            "match": req.match,
+            "group_lines": req.group_lines,
+            "region": region,
+            "matches": matches,
+            "match_count": len(matches),  # length of the returned list, i.e. after the max_results cap
+            "blocks_considered": len(blocks),  # OCR blocks returned by _run_ocr, before grouping and matching
+        },
+    }
+
+
@app.post("/batch")
 def batch(req: BatchRequest, screen: int = 0, _: None = Depends(_auth)):
    results = []
--- a/skill/SKILL.md
+++ b/skill/SKILL.md
@@ -55,6 +55,7 @@ Say what you actually have: screenshots, OCR output, and fresh verification capt
 - `POST /launch` → start an app/process without dropping to a shell
 - `POST /wait?screen=0` → wait for text, window, or visual state changes
 - `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided image bytes
+- `POST /ocr/find?screen=0` → search OCR output for matching text candidates
 - `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)
 - `POST /batch?screen=0` → sequential action list
 - `POST /exec` → PowerShell/Bash/CMD command execution (requires configured exec secret + header)