feat(ocr): add /ocr endpoint for screen, region, and image input
This commit is contained in:
@@ -7,6 +7,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
|
|||||||
- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes)
|
- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes)
|
||||||
- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
|
- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
|
||||||
- **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
|
- **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
|
||||||
|
- **OCR endpoint**: extract text blocks with bounding boxes via `POST /ocr`
|
||||||
- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
|
- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
|
||||||
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
|
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
|
||||||
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
|
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
|
||||||
@@ -23,6 +24,8 @@ CLICKTHROUGH_TOKEN=change-me python -m server.app
|
|||||||
|
|
||||||
Server defaults to `127.0.0.1:8123`.
|
Server defaults to `127.0.0.1:8123`.
|
||||||
|
|
||||||
|
For OCR support, install the native `tesseract` binary on the host (in addition to Python deps).
|
||||||
|
|
||||||
`python-dotenv` is enabled, so values from a repo-root `.env` file are loaded automatically.
|
`python-dotenv` is enabled, so values from a repo-root `.env` file are loaded automatically.
|
||||||
|
|
||||||
## Minimal API flow
|
## Minimal API flow
|
||||||
|
|||||||
71
docs/API.md
71
docs/API.md
@@ -143,6 +143,77 @@ Hotkey:
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## `POST /ocr`
|
||||||
|
|
||||||
|
Extract visible text from a full screenshot, a region crop, or caller-provided image bytes.
|
||||||
|
|
||||||
|
Body:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mode": "screen",
|
||||||
|
"language_hint": "eng",
|
||||||
|
"min_confidence": 0.4
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Modes:
|
||||||
|
- `screen` (default): OCR over full captured monitor
|
||||||
|
- `region`: OCR over explicit region (`region_x`, `region_y`, `region_width`, `region_height`)
|
||||||
|
- `image`: OCR over provided `image_base64` (supports plain base64 or data URL)
|
||||||
|
|
||||||
|
Region mode example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mode": "region",
|
||||||
|
"region_x": 220,
|
||||||
|
"region_y": 160,
|
||||||
|
"region_width": 900,
|
||||||
|
"region_height": 400,
|
||||||
|
"language_hint": "eng",
|
||||||
|
"min_confidence": 0.5
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Image mode example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"mode": "image",
|
||||||
|
"image_base64": "iVBORw0KGgoAAAANSUhEUgAA...",
|
||||||
|
"language_hint": "eng"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Response shape:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"ok": true,
|
||||||
|
"request_id": "...",
|
||||||
|
"time_ms": 1710000000000,
|
||||||
|
"result": {
|
||||||
|
"mode": "screen",
|
||||||
|
"language_hint": "eng",
|
||||||
|
"min_confidence": 0.4,
|
||||||
|
"region": {"x": 0, "y": 0, "width": 1920, "height": 1080},
|
||||||
|
"blocks": [
|
||||||
|
{
|
||||||
|
"text": "Settings",
|
||||||
|
"confidence": 0.9821,
|
||||||
|
"bbox": {"x": 144, "y": 92, "width": 96, "height": 21}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- Output is deterministic JSON: blocks are ordered top-to-bottom, then left-to-right.
|
||||||
|
- `bbox` coordinates are in global screen space for `screen`/`region`, and image-local for `image`.
|
||||||
|
- Requires `tesseract` executable plus Python package `pytesseract`.
|
||||||
|
|
||||||
## `POST /exec`
|
## `POST /exec`
|
||||||
|
|
||||||
Execute a shell command on the host running Clickthrough.
|
Execute a shell command on the host running Clickthrough.
|
||||||
|
|||||||
@@ -4,3 +4,4 @@ python-dotenv>=1.0.1
|
|||||||
mss>=9.0.1
|
mss>=9.0.1
|
||||||
pillow>=10.4.0
|
pillow>=10.4.0
|
||||||
pyautogui>=0.9.54
|
pyautogui>=0.9.54
|
||||||
|
pytesseract>=0.3.10
|
||||||
|
|||||||
165
server/app.py
165
server/app.py
@@ -146,6 +146,27 @@ class ExecRequest(BaseModel):
|
|||||||
dry_run: bool = False
|
dry_run: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class OCRRequest(BaseModel):
    # Request body for POST /ocr.
    #
    # `mode` picks the image source: the captured screen (default), a
    # rectangle described by the four `region_*` fields, or caller-supplied
    # `image_base64` data. `min_confidence` is a 0.0-1.0 threshold.
    # NOTE: documented with comments on purpose -- a class docstring would be
    # published as the schema description.
    mode: Literal["screen", "region", "image"] = "screen"
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    image_base64: str | None = None
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_mode_inputs(self):
        """Reject requests that lack the fields their ``mode`` depends on."""
        if self.mode == "region":
            # All four rectangle fields must be present; 0 is a valid value,
            # so only a literal None counts as missing.
            rectangle = (self.region_x, self.region_y, self.region_width, self.region_height)
            if None in rectangle:
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        elif self.mode == "image":
            # An empty string is treated the same as a missing payload.
            if not self.image_base64:
                raise ValueError("image_base64 is required for mode=image")
        return self
|
||||||
|
|
||||||
|
|
||||||
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
|
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
|
||||||
token = SETTINGS["token"]
|
token = SETTINGS["token"]
|
||||||
if token and x_clickthrough_token != token:
|
if token and x_clickthrough_token != token:
|
||||||
@@ -275,6 +296,101 @@ def _import_input_lib():
|
|||||||
raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc
|
raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
def _import_ocr_libs():
    """Import the OCR backend on demand.

    Returns:
        A ``(pytesseract, Output)`` pair: the module and its ``Output`` name.

    Raises:
        HTTPException: status 500 if importing ``pytesseract`` fails for any
            reason; the original error text is included in the detail.
    """
    try:
        import pytesseract
        from pytesseract import Output
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"ocr backend unavailable: {exc}") from exc

    return pytesseract, Output
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_image_base64(value: str):
    """Decode a base64 string (plain or ``data:`` URL) into an RGB image.

    Args:
        value: Base64 text, optionally wrapped as ``data:<meta>,<payload>``.
            Surrounding whitespace is ignored.

    Returns:
        The decoded image converted to ``"RGB"`` mode.

    Raises:
        HTTPException: status 400 if a ``data:`` URL has no comma separator,
            if the payload is not strictly valid base64, or if the decoded
            bytes cannot be opened as an image.
    """
    # First element of the capture-libs tuple is used as the image opener
    # (presumably PIL's Image module -- confirm against _import_capture_libs).
    Image, _, _ = _import_capture_libs()

    encoded = value.strip()
    if encoded.startswith("data:"):
        # Everything up to the first comma is the data-URL header.
        _header, comma, body = encoded.partition(",")
        if not comma:
            raise HTTPException(status_code=400, detail="invalid data URL image payload")
        encoded = body

    try:
        # validate=True rejects any character outside the base64 alphabet.
        raw = base64.b64decode(encoded, validate=True)
    except Exception as exc:
        raise HTTPException(status_code=400, detail="invalid image_base64 payload") from exc

    try:
        buffer = io.BytesIO(raw)
        opened = Image.open(buffer)
        return opened.convert("RGB")
    except Exception as exc:
        raise HTTPException(status_code=400, detail="unsupported or unreadable image bytes") from exc
|
||||||
|
|
||||||
|
|
||||||
|
def _run_ocr(image, language_hint: str | None, min_confidence: float, offset_x: int = 0, offset_y: int = 0) -> list[dict]:
    """Run Tesseract on ``image`` and return the recognised text blocks.

    Args:
        image: Image handed to ``pytesseract.image_to_data``.
        language_hint: Passed as Tesseract's ``lang`` when truthy.
        min_confidence: Threshold on the 0.0-1.0 scale; entries scoring
            lower are dropped.
        offset_x: Added to every reported ``bbox["x"]``.
        offset_y: Added to every reported ``bbox["y"]``.

    Returns:
        A list of ``{"text", "confidence", "bbox"}`` dicts ordered by
        ``y``, then ``x``, then Tesseract's own output index.

    Raises:
        HTTPException: status 500 when the tesseract executable is missing,
            status 400 when Tesseract reports an error.
    """
    pytesseract, Output = _import_ocr_libs()

    ocr_args = {
        "image": image,
        "output_type": Output.DICT,
        "config": "--oem 3 --psm 6",
    }
    if language_hint:
        ocr_args["lang"] = language_hint

    try:
        data = pytesseract.image_to_data(**ocr_args)
    except pytesseract.TesseractNotFoundError as exc:
        raise HTTPException(status_code=500, detail="tesseract executable not found") from exc
    except pytesseract.TesseractError as exc:
        raise HTTPException(status_code=400, detail=f"ocr failed: {exc}") from exc

    # Each entry is (sort_key, block); the key never reaches the caller.
    keyed_blocks = []
    for idx, raw_text in enumerate(data.get("text", [])):
        text = (raw_text or "").strip()
        if not text:
            continue

        # Confidence arrives on a 0-100 scale; unparsable values are skipped
        # exactly like negative ones.
        try:
            conf_pct = float(str(data["conf"][idx]).strip())
        except ValueError:
            continue
        if conf_pct < 0:
            continue

        confidence = round(conf_pct / 100.0, 4)
        if confidence < min_confidence:
            continue

        left = int(data["left"][idx])
        top = int(data["top"][idx])
        width = int(data["width"][idx])
        height = int(data["height"][idx])

        x = left + offset_x
        y = top + offset_y
        block = {
            "text": text,
            "confidence": confidence,
            "bbox": {"x": x, "y": y, "width": width, "height": height},
        }
        # idx makes every key unique, so the order is fully determined.
        keyed_blocks.append(((y, x, idx), block))

    keyed_blocks.sort(key=lambda pair: pair[0])
    return [block for _key, block in keyed_blocks]
|
||||||
|
|
||||||
|
|
||||||
def _pick_shell(explicit_shell: str | None) -> str:
|
def _pick_shell(explicit_shell: str | None) -> str:
|
||||||
shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
|
shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
|
||||||
if shell_name not in {"powershell", "bash", "cmd"}:
|
if shell_name not in {"powershell", "bash", "cmd"}:
|
||||||
@@ -600,6 +716,55 @@ def exec_command(
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ocr")
def ocr(req: OCRRequest, _: None = Depends(_auth)):
    # OCR endpoint: resolve the source image and its coordinate origin from
    # req.mode, run OCR once, and wrap the blocks in the response envelope.
    # NOTE: documented with comments on purpose -- a route docstring would be
    # published in the generated API description.
    mode = req.mode

    if mode == "image":
        # Caller-supplied image: bbox coordinates stay image-local.
        image = _decode_image_base64(req.image_base64 or "")
        origin_x = 0
        origin_y = 0
        region = {"x": 0, "y": 0, "width": image.size[0], "height": image.size[1]}
    else:
        screenshot, mon = _capture_screen()
        if mode == "screen":
            image = screenshot
            origin_x = mon["x"]
            origin_y = mon["y"]
            region = {"x": mon["x"], "y": mon["y"], "width": mon["width"], "height": mon["height"]}
        else:
            # Translate the requested rectangle into screenshot-local pixels.
            left = req.region_x - mon["x"]
            top = req.region_y - mon["y"]
            right = left + req.region_width
            bottom = top + req.region_height

            fits = (
                left >= 0
                and top >= 0
                and right <= screenshot.size[0]
                and bottom <= screenshot.size[1]
            )
            if not fits:
                raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

            image = screenshot.crop((left, top, right, bottom))
            origin_x = req.region_x
            origin_y = req.region_y
            region = {
                "x": req.region_x,
                "y": req.region_y,
                "width": req.region_width,
                "height": req.region_height,
            }

    blocks = _run_ocr(image, req.language_hint, req.min_confidence, origin_x, origin_y)

    result = {
        "mode": mode,
        "language_hint": req.language_hint,
        "min_confidence": req.min_confidence,
        "region": region,
        "blocks": blocks,
    }
    return {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "result": result,
    }
|
||||||
|
|
||||||
|
|
||||||
@app.post("/batch")
|
@app.post("/batch")
|
||||||
def batch(req: BatchRequest, _: None = Depends(_auth)):
|
def batch(req: BatchRequest, _: None = Depends(_auth)):
|
||||||
results = []
|
results = []
|
||||||
|
|||||||
Reference in New Issue
Block a user