feat: migrate to v2-only API and unified response envelope
All checks were successful
python-syntax / syntax-check (push) Successful in 7s

This commit is contained in:
2026-05-03 19:11:11 +02:00
parent 2585bc3a7c
commit aced5be25e
5 changed files with 603 additions and 1267 deletions

View File

@@ -8,10 +8,12 @@ import subprocess
import sys
import time
import uuid
from typing import Literal, Optional
from typing import Any, Literal, Optional
from dotenv import load_dotenv
from fastapi import Depends, FastAPI, Header, HTTPException, Response
from fastapi import Depends, FastAPI, Header, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from PIL import ImageChops, ImageStat
from pydantic import BaseModel, Field, model_validator
@@ -21,6 +23,55 @@ load_dotenv(dotenv_path=".env", override=False)
app = FastAPI(title="clickthrough", version="0.1.0")
def _ok(data: Any, status_code: int = 200):
    """Wrap a successful result in the unified response envelope.

    Args:
        data: Payload placed under the ``data`` key.
        status_code: HTTP status of the response (200 unless overridden).

    Returns:
        A ``JSONResponse`` whose body has ``ok`` set to True and ``error``
        set to None.
    """
    envelope = {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "data": data,
        "error": None,
    }
    return JSONResponse(status_code=status_code, content=envelope)
def _err(code: str, message: str, status_code: int, details: Any = None):
    """Wrap a failure in the unified response envelope.

    Args:
        code: Short machine-readable error code.
        message: Human-readable description of the failure.
        status_code: HTTP status of the response.
        details: Optional extra context placed under ``error.details``.

    Returns:
        A ``JSONResponse`` whose body has ``ok`` set to False and ``data``
        set to None.
    """
    envelope = {
        "ok": False,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "data": None,
        "error": {"code": code, "message": message, "details": details},
    }
    return JSONResponse(status_code=status_code, content=envelope)
@app.exception_handler(HTTPException)
async def _http_exception_handler(_: Request, exc: HTTPException):
    """Render an ``HTTPException`` as a unified error envelope.

    A dict ``detail`` is passed through as ``error.details`` and its
    ``"message"`` entry (or "request failed") becomes the message; any other
    ``detail`` is stringified and used as the message.

    Headers attached to the exception (for example ``WWW-Authenticate``) are
    copied onto the response so they are not lost by the envelope.
    """
    # NOTE(review): routing-level errors (unknown path, wrong method) are
    # raised as Starlette's HTTPException, which this registration may not
    # match -- confirm whether those should use the envelope too.
    detail = exc.detail
    if isinstance(detail, dict):
        message = str(detail.get("message", "request failed"))
        response = _err("http_error", message, exc.status_code, detail)
    else:
        response = _err("http_error", str(detail), exc.status_code)

    extra_headers = getattr(exc, "headers", None)
    if extra_headers:
        for header_name, header_value in extra_headers.items():
            response.headers[header_name] = header_value
    return response
@app.exception_handler(Exception)
async def _unhandled_exception_handler(_: Request, exc: Exception):
    """Render any otherwise-unhandled exception as a generic 500 envelope.

    Only the exception's class name is exposed to the client; its message is
    deliberately not included in the response.
    """
    exception_info = {"type": type(exc).__name__}
    return _err("internal_error", "internal server error", 500, exception_info)
@app.exception_handler(RequestValidationError)
async def _validation_exception_handler(_: Request, exc: RequestValidationError):
    """Render a request-validation failure as a 422 error envelope.

    The validation error list is placed under ``error.details``.
    """
    # Local import keeps this block self-contained.
    from fastapi.encoders import jsonable_encoder

    # Error entries can carry non-JSON values (e.g. the original ValueError
    # raised by a model validator, under ``ctx``); encode them first so that
    # building the JSON response cannot itself fail.
    details = jsonable_encoder(exc.errors())
    return _err("validation_error", "request validation failed", 422, details)
def _env_bool(name: str, default: bool) -> bool:
raw = os.getenv(name)
if raw is None:
@@ -288,6 +339,144 @@ class VerifyActionRequest(BaseModel):
stop_on_action_error: bool = True
class ObserveRequestV2(BaseModel):
    # Request body for POST /v2/observe: what to capture, whether to return
    # the image, and whether/how to run OCR on it.

    # Capture selection; the region_* fields are only required for "region".
    mode: Literal["screen", "region"] = "screen"
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)

    # Image encoding options for the returned capture.
    include_image: bool = True
    image_format: Literal["png", "jpeg"] = "jpeg"
    jpeg_quality: int = Field(default=75, ge=1, le=100)

    # OCR options; "none" disables OCR.
    ocr_mode: Literal["none", "region", "screen"] = "none"
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.4, ge=0.0, le=1.0)
    max_ocr_area_px: int | None = Field(default=1_500_000, ge=1000)
    group_lines: bool = True

    @model_validator(mode="after")
    def _validate_region(self):
        # Region geometry is mandatory only when a region capture is requested.
        if self.mode != "region":
            return self
        geometry = (self.region_x, self.region_y, self.region_width, self.region_height)
        for part in geometry:
            if part is None:
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        return self
class ImageToolPoint(BaseModel):
    # A non-negative (x, y) point. localize_v2 checks it against the stored
    # observation image size and adds the observation region origin to it.
    x: int = Field(ge=0)
    y: int = Field(ge=0)
class LocalizeRequestV2(BaseModel):
    # Request body for POST /v2/localize: pick a target inside a stored
    # observation either by OCR text or by a point in the observation image.
    observation_id: str = Field(min_length=1, max_length=128)
    text_query: str | None = Field(default=None, max_length=512)
    text_match: Literal["contains", "exact", "regex"] = "contains"
    image_tool_point: ImageToolPoint | None = None
    # Which text match to use when several candidates are found.
    candidate_index: int = Field(default=0, ge=0)

    @model_validator(mode="after")
    def _validate_selector(self):
        # A blank or whitespace-only text_query counts as "not provided".
        text_given = bool((self.text_query or "").strip())
        point_given = self.image_tool_point is not None
        # Exactly one selector must be present: both or neither is an error.
        if text_given != point_given:
            return self
        raise ValueError("provide exactly one of text_query or image_tool_point")
class ActionTargetV2(BaseModel):
    # Target of a v2 action: either the id of a previously resolved target
    # or an explicit pixel coordinate pair, never both.
    resolved_target_id: str | None = Field(default=None, max_length=128)
    pixel_x: int | None = None
    pixel_y: int | None = None

    @model_validator(mode="after")
    def _validate_shape(self):
        by_id = bool(self.resolved_target_id)
        coordinate_missing = (self.pixel_x is None, self.pixel_y is None)
        # "Pixel form" is in use as soon as either coordinate is supplied.
        by_pixel = not all(coordinate_missing)
        if by_id == by_pixel:
            raise ValueError("provide either resolved_target_id or pixel_x/pixel_y")
        if by_pixel and any(coordinate_missing):
            raise ValueError("pixel_x and pixel_y are both required")
        return self
class ActionRequestV2(BaseModel):
    # A single v2 input action. _resolve_v2_action copies every field below
    # into an ActionRequest, replacing `target` with a pixel target.
    action: Literal[
        "move",
        "click",
        "right_click",
        "double_click",
        "middle_click",
        "scroll",
        "type",
        "hotkey",
    ]
    # Optional; when omitted the resulting ActionRequest has target=None.
    target: ActionTargetV2 | None = None
    duration_ms: int = Field(default=0, ge=0, le=20000)
    button: Literal["left", "right", "middle"] = "left"
    clicks: int = Field(default=1, ge=1, le=10)
    scroll_amount: int = 0
    text: str = ""
    keys: list[str] = Field(default_factory=list)
    interval_ms: int = Field(default=20, ge=0, le=5000)
    dry_run: bool = False
class ActRequestV2(BaseModel):
    # Request body for POST /v2/act: wraps the action to execute.
    action: ActionRequestV2
class ActVerifyRequestV2(BaseModel):
    # Request body for POST /v2/act-verify: an action plus the condition
    # that must hold afterwards.
    action: ActionRequestV2
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    # Selects the default retry/timeout profile (see _risk_defaults).
    risk_level: Literal["low", "high"] = "low"
    # The four fields below override the risk profile; None means "use the
    # default for risk_level" (resolved in act_verify_v2).
    retries: int | None = Field(default=None, ge=0, le=10)
    timeout_ms: int | None = Field(default=None, ge=0, le=120000)
    poll_interval_ms: int | None = Field(default=None, ge=50, le=10000)
    retry_delay_ms: int | None = Field(default=None, ge=0, le=60000)
    stop_on_action_error: bool = True
class _BoundedStore(dict):
    """Insertion-ordered dict that evicts its oldest entries past a size cap.

    Only item assignment (``store[key] = value``) enforces the cap; bulk
    methods such as ``update`` are inherited unchanged from ``dict``.
    Re-assigning an existing key keeps its original position.

    Args:
        max_items: Maximum number of entries retained (must be >= 1).

    Raises:
        ValueError: If ``max_items`` is smaller than 1.
    """

    def __init__(self, max_items: int):
        import threading

        super().__init__()
        if max_items < 1:
            raise ValueError("max_items must be >= 1")
        self.max_items = max_items
        # Insert-then-evict must be atomic: two concurrent writers could
        # otherwise both try to delete the same oldest key.
        self._lock = threading.Lock()

    def __setitem__(self, key, value):
        with self._lock:
            super().__setitem__(key, value)
            while len(self) > self.max_items:
                # dicts iterate in insertion order, so the first key is the oldest.
                super().__delitem__(next(iter(self)))


# In-memory stores for v2 observations and resolved targets. They are capped
# so a long-running server does not accumulate entries without bound; an
# evicted id simply becomes "not found" for later lookups.
OBSERVATIONS: dict[str, dict[str, Any]] = _BoundedStore(256)
RESOLVED_TARGETS: dict[str, dict[str, Any]] = _BoundedStore(1024)
def _get_observation(observation_id: str) -> dict[str, Any]:
    """Return the stored observation for ``observation_id``.

    Raises:
        HTTPException: 404 when no observation is stored under that id.
    """
    found = OBSERVATIONS.get(observation_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail="observation_id not found")
def _resolve_v2_action(req: ActionRequestV2) -> ActionRequest:
    """Translate a v2 action into an ``ActionRequest`` with a pixel target.

    A ``resolved_target_id`` is looked up in ``RESOLVED_TARGETS``; explicit
    pixel coordinates are used as given. With no target at all, the resulting
    request has ``target=None``. All other fields are copied unchanged.

    Raises:
        HTTPException: 404 when the referenced resolved target is unknown.
    """
    requested = req.target
    pixel_target: Target | None = None

    if requested is not None and requested.resolved_target_id:
        stored = RESOLVED_TARGETS.get(requested.resolved_target_id)
        if stored is None:
            raise HTTPException(status_code=404, detail="resolved_target_id not found")
        pixel_target = PixelTarget(mode="pixel", x=stored["x"], y=stored["y"], dx=0, dy=0)
    elif requested is not None:
        pixel_target = PixelTarget(
            mode="pixel",
            x=requested.pixel_x or 0,
            y=requested.pixel_y or 0,
            dx=0,
            dy=0,
        )

    return ActionRequest(
        action=req.action,
        target=pixel_target,
        duration_ms=req.duration_ms,
        button=req.button,
        clicks=req.clicks,
        scroll_amount=req.scroll_amount,
        text=req.text,
        keys=req.keys,
        interval_ms=req.interval_ms,
        dry_run=req.dry_run,
    )
def _risk_defaults(risk_level: str) -> dict[str, int]:
    """Return the default retry/timeout settings for a risk level.

    ``"high"`` selects the more patient profile (one retry, longer timeout);
    any other value selects the low-risk profile.
    """
    high = risk_level == "high"
    return {
        "retries": 1 if high else 0,
        "timeout_ms": 6000 if high else 2500,
        "poll_interval_ms": 250 if high else 200,
        "retry_delay_ms": 300 if high else 150,
    }
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
token = SETTINGS["token"]
@@ -1377,154 +1566,225 @@ def _exec_action(req: ActionRequest, screen: int = 0) -> dict:
}
def _localization_confidence(source: str, confidence: float | None = None) -> str:
if source == "image_tool_point":
return "high"
if source == "ocr" and confidence is not None:
if confidence >= 0.8:
return "high"
if confidence >= 0.55:
return "medium"
return "low"
@app.post("/v2/observe")
def observe_v2(req: ObserveRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Capture the screen (or a region), optionally OCR it, and store the result.

    The capture metadata and OCR output are saved in ``OBSERVATIONS`` under a
    fresh id so that later calls can refer back to this observation.

    Raises:
        HTTPException: 400 when the OCR area exceeds ``req.max_ocr_area_px``.
    """
    region_mode = req.mode == "region"
    capture_t0 = time.perf_counter()
    image, region, mon, displays, screen_selection = _capture_region_image(
        screen,
        req.region_x if region_mode else None,
        req.region_y if region_mode else None,
        req.region_width if region_mode else None,
        req.region_height if region_mode else None,
    )
    capture_ms = int((time.perf_counter() - capture_t0) * 1000)

    encoded = _encode_image(image, req.image_format, req.jpeg_quality) if req.include_image else None

    wants_ocr = req.ocr_mode != "none"
    ocr_t0 = time.perf_counter()
    blocks: list[dict] = []
    grouped_lines: list[dict] = []
    ocr_applied_mode = "none"
    if wants_ocr:
        if req.ocr_mode == "screen":
            # Full-screen OCR takes its own capture, independent of the
            # (possibly regional) image returned to the caller.
            ocr_image, ocr_region, _mon, _displays, _selection = _capture_region_image(
                screen, None, None, None, None
            )
        else:
            ocr_image, ocr_region = image, region

        area = ocr_region["width"] * ocr_region["height"]
        if req.max_ocr_area_px is not None and area > req.max_ocr_area_px:
            raise HTTPException(
                status_code=400,
                detail=f"ocr area {area} exceeds max_ocr_area_px {req.max_ocr_area_px}",
            )

        blocks = _run_ocr(
            ocr_image,
            req.language_hint,
            req.min_confidence,
            ocr_region["x"],
            ocr_region["y"],
        )
        if req.group_lines:
            grouped_lines = _group_ocr_lines(blocks)
        ocr_applied_mode = req.ocr_mode
    elapsed_ocr_ms = int((time.perf_counter() - ocr_t0) * 1000)
    # OCR time is only reported when OCR was actually requested.
    ocr_ms = elapsed_ocr_ms if wants_ocr else 0

    image_width, image_height = image.size[0], image.size[1]
    observation_id = _request_id()
    OBSERVATIONS[observation_id] = {
        "id": observation_id,
        "region": region,
        "screen": screen_selection,
        "display": mon,
        "image_width": image_width,
        "image_height": image_height,
        "ocr_blocks": blocks,
        "ocr_lines": grouped_lines,
        "created_at_ms": _now_ms(),
    }

    image_payload = {
        "included": req.include_image,
        "format": req.image_format if req.include_image else None,
        "base64": encoded,
        "width": image_width,
        "height": image_height,
    }
    ocr_payload = {
        "mode": ocr_applied_mode,
        "min_confidence": req.min_confidence,
        "language_hint": req.language_hint,
        "block_count": len(blocks),
        "line_count": len(grouped_lines),
        "blocks": blocks,
        "lines": grouped_lines,
    }
    timing_payload = {
        "capture_ms": capture_ms,
        "ocr_ms": ocr_ms,
        "total_ms": capture_ms + ocr_ms,
    }
    return _ok(
        {
            "observation_id": observation_id,
            "region": region,
            "screen": screen_selection,
            "display": mon,
            "image": image_payload,
            "ocr": ocr_payload,
            "timing_ms": timing_payload,
        }
    )
@app.post("/v2/localize")
def localize_v2(req: LocalizeRequestV2, _: None = Depends(_auth)):
    """Resolve a point or text query within a stored observation to a pixel.

    A successful resolution is recorded in ``RESOLVED_TARGETS`` under a fresh
    id, which is returned as ``resolved_target_id``. Text queries search the
    OCR lines stored with the observation; when nothing matches, a
    ``not_found`` error envelope (404) is returned.

    Raises:
        HTTPException: 404 for an unknown observation; 400 when the point is
            outside the observation image or ``candidate_index`` is past the
            end of the match list.
    """
    observation = _get_observation(req.observation_id)
    region = observation["region"]
    image_width = observation["image_width"]
    image_height = observation["image_height"]

    point = req.image_tool_point
    if point is not None:
        inside = point.x < image_width and point.y < image_height
        if not inside:
            raise HTTPException(status_code=400, detail="image_tool_point outside observation image bounds")
        # The point is offset by the observation region's origin.
        x = region["x"] + point.x
        y = region["y"] + point.y
        _enforce_allowed_region(x, y)
        resolved_target_id = _request_id()
        RESOLVED_TARGETS[resolved_target_id] = {
            "id": resolved_target_id,
            "observation_id": req.observation_id,
            "x": x,
            "y": y,
            "source": "image_tool_point",
        }
        point_payload = {
            "resolved_target_id": resolved_target_id,
            "source": "image_tool_point",
            "localization_confidence": _localization_confidence("image_tool_point"),
            "pixel": {"x": x, "y": y},
            "observation_region": region,
            "image_bounds": {"width": image_width, "height": image_height},
        }
        return _ok(point_payload)

    # Prefer the grouped lines stored at observe time; otherwise group the
    # stored blocks now.
    lines = observation.get("ocr_lines") or _group_ocr_lines(observation.get("ocr_blocks", []))
    matches = _find_text_matches(lines, req.text_query or "", req.text_match, False, 200)
    if not matches:
        return _err("not_found", "no localization candidates found", 404, {"found": False, "matches": []})
    if req.candidate_index >= len(matches):
        raise HTTPException(status_code=400, detail="candidate_index is outside match results")

    chosen = matches[req.candidate_index]
    bbox = chosen["bbox"]
    # Aim at the middle of the matched box, at least one pixel inside it.
    x = bbox["x"] + max(1, bbox["width"] // 2)
    y = bbox["y"] + max(1, bbox["height"] // 2)
    _enforce_allowed_region(x, y)
    resolved_target_id = _request_id()
    RESOLVED_TARGETS[resolved_target_id] = {
        "id": resolved_target_id,
        "observation_id": req.observation_id,
        "x": x,
        "y": y,
        "source": "ocr",
        "match": chosen,
    }
    text_payload = {
        "resolved_target_id": resolved_target_id,
        "source": "ocr",
        "localization_confidence": _localization_confidence("ocr", chosen.get("confidence")),
        "pixel": {"x": x, "y": y},
        "selected_match": chosen,
        "match_count": len(matches),
    }
    return _ok(text_payload)
@app.post("/v2/act")
def act_v2(req: ActRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Execute a single v2 action and return its result in the envelope."""
    resolved = _resolve_v2_action(req.action)
    return _ok(_exec_action(resolved, screen))
@app.post("/v2/act-verify")
def act_verify_v2(req: ActVerifyRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Run an action and verify its condition, filling gaps from the risk profile.

    Returns a success envelope when the verification result reports
    ``success``; otherwise a ``verification_failed`` error envelope (409)
    carrying the same payload as details.
    """
    defaults = _risk_defaults(req.risk_level)

    def _setting(name: str) -> int:
        # An explicit request value (including 0) wins over the risk default.
        override = getattr(req, name)
        return defaults[name] if override is None else override

    verify_req = VerifyActionRequest(
        action=_resolve_v2_action(req.action),
        condition=req.condition,
        retries=_setting("retries"),
        timeout_ms=_setting("timeout_ms"),
        poll_interval_ms=_setting("poll_interval_ms"),
        retry_delay_ms=_setting("retry_delay_ms"),
        stop_on_action_error=req.stop_on_action_error,
    )
    result = _run_verified_action(verify_req, screen)
    payload = {
        "risk_level": req.risk_level,
        "defaults_applied": defaults,
        **result,
    }
    if not result.get("success", False):
        return _err("verification_failed", "action verification did not satisfy condition", 409, payload)
    return _ok(payload)
@app.get("/health")
def health(_: None = Depends(_auth)):
return {
"ok": True,
"service": "clickthrough",
"version": app.version,
"time_ms": _now_ms(),
"request_id": _request_id(),
"dry_run": SETTINGS["dry_run"],
"allowed_region": SETTINGS["allowed_region"],
"exec": {
"enabled": SETTINGS["exec_enabled"],
"secret_configured": bool(SETTINGS["exec_secret"]),
"default_shell": SETTINGS["exec_default_shell"],
"default_timeout_s": SETTINGS["exec_default_timeout_s"],
"max_timeout_s": SETTINGS["exec_max_timeout_s"],
},
}
return _ok(
{
"service": "clickthrough",
"version": app.version,
"dry_run": SETTINGS["dry_run"],
"allowed_region": SETTINGS["allowed_region"],
"exec": {
"enabled": SETTINGS["exec_enabled"],
"secret_configured": bool(SETTINGS["exec_secret"]),
"default_shell": SETTINGS["exec_default_shell"],
"default_timeout_s": SETTINGS["exec_default_timeout_s"],
"max_timeout_s": SETTINGS["exec_max_timeout_s"],
},
}
)
@app.get("/displays")
def displays(_: None = Depends(_auth)):
detected = _get_displays()
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"displays": detected,
"default_screen": 0,
}
@app.get("/screen")
def screen(
with_grid: bool = True,
grid_rows: int = SETTINGS["default_grid_rows"],
grid_cols: int = SETTINGS["default_grid_cols"],
include_labels: bool = True,
image_format: Literal["png", "jpeg"] = "png",
jpeg_quality: int = 85,
asImage: bool = False,
screen: int = 0,
_: None = Depends(_auth),
):
req = ScreenRequest(
with_grid=with_grid,
grid_rows=grid_rows,
grid_cols=grid_cols,
include_labels=include_labels,
image_format=image_format,
jpeg_quality=jpeg_quality,
)
base_img, mon, displays, screen_selection = _capture_screen(screen)
meta = {"region": mon, "screen": screen_selection, "displays": displays}
out_img = base_img
if req.with_grid:
out_img, grid_meta = _draw_grid(base_img, mon["x"], mon["y"], req.grid_rows, req.grid_cols, req.include_labels)
meta.update(grid_meta)
if asImage:
image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality)
media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png"
return Response(content=image_bytes, media_type=media_type)
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"image": {
"format": req.image_format,
"base64": encoded,
"width": out_img.size[0],
"height": out_img.size[1],
},
"meta": meta,
}
@app.post("/zoom")
def zoom(req: ZoomRequest, asImage: bool = False, screen: int = 0, _: None = Depends(_auth)):
base_img, mon, displays, screen_selection = _capture_screen(screen)
cx = req.center_x - mon["x"]
cy = req.center_y - mon["y"]
half_w = req.width // 2
half_h = req.height // 2
left = max(0, cx - half_w)
top = max(0, cy - half_h)
right = min(base_img.size[0], left + req.width)
bottom = min(base_img.size[1], top + req.height)
crop = base_img.crop((left, top, right, bottom))
region_x = mon["x"] + left
region_y = mon["y"] + top
meta = {
"source_monitor": mon,
"screen": screen_selection,
"displays": displays,
"region": {
"x": region_x,
"y": region_y,
"width": crop.size[0],
"height": crop.size[1],
},
}
out_img = crop
if req.with_grid:
out_img, grid_meta = _draw_grid(crop, region_x, region_y, req.grid_rows, req.grid_cols, req.include_labels)
meta.update(grid_meta)
if asImage:
image_bytes = _serialize_image(out_img, req.image_format, req.jpeg_quality)
media_type = "image/jpeg" if req.image_format == "jpeg" else "image/png"
return Response(content=image_bytes, media_type=media_type)
encoded = _encode_image(out_img, req.image_format, req.jpeg_quality)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"image": {
"format": req.image_format,
"base64": encoded,
"width": out_img.size[0],
"height": out_img.size[1],
},
"meta": meta,
}
@app.post("/action")
def action(req: ActionRequest, screen: int = 0, _: None = Depends(_auth)):
result = _exec_action(req, screen)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
return _ok({"displays": detected, "default_screen": 0})
@app.post("/exec")
@@ -1540,12 +1800,7 @@ def exec_command(
raise HTTPException(status_code=401, detail="invalid exec secret")
result = _exec_command(req)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
return _ok(result)
@app.get("/windows")
@@ -1565,151 +1820,19 @@ def windows(
visible_only=visible_only,
)
matches = _list_windows(query)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"windows": matches,
"count": len(matches),
}
return _ok({"windows": matches, "count": len(matches)})
@app.post("/windows/action")
def window_action(req: WindowActionRequest, _: None = Depends(_auth)):
result = _apply_window_action(req)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
return _ok(result)
@app.post("/launch")
def launch(req: LaunchRequest, _: None = Depends(_auth)):
result = _launch_app(req)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
@app.post("/wait")
def wait(req: WaitRequest, screen: int = 0, _: None = Depends(_auth)):
result = _wait_for_condition(req, screen)
return {
"ok": result.get("satisfied", False),
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
@app.post("/vision/diff")
def vision_diff(req: VisionDiffRequest, screen: int = 0, _: None = Depends(_auth)):
result = _compute_visual_diff(req, screen)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
@app.post("/vision/stability")
def vision_stability(req: VisionStabilityRequest, screen: int = 0, _: None = Depends(_auth)):
result = _measure_stability(req, screen)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
@app.post("/action/verify")
def action_verify(req: VerifyActionRequest, screen: int = 0, _: None = Depends(_auth)):
result = _run_verified_action(req, screen)
return {
"ok": result.get("success", False),
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": result,
}
@app.post("/ocr")
def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
offset_x = region["x"] if source != "image" else 0
offset_y = region["y"] if source != "image" else 0
blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": {
"mode": source,
"screen": screen_selection if source != "image" else None,
"display": mon if source != "image" else None,
"language_hint": req.language_hint,
"min_confidence": req.min_confidence,
"region": region,
"blocks": blocks,
},
}
@app.post("/ocr/find")
def ocr_find(req: OCRFindRequest, screen: int = 0, _: None = Depends(_auth)):
image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)
offset_x = region["x"] if source != "image" else 0
offset_y = region["y"] if source != "image" else 0
blocks = _run_ocr(image, req.language_hint, req.min_confidence, offset_x, offset_y)
matches = _find_text_matches(blocks, req.query, req.match, req.group_lines, req.max_results)
return {
"ok": True,
"request_id": _request_id(),
"time_ms": _now_ms(),
"result": {
"mode": source,
"screen": screen_selection if source != "image" else None,
"display": mon if source != "image" else None,
"language_hint": req.language_hint,
"min_confidence": req.min_confidence,
"query": req.query,
"match": req.match,
"group_lines": req.group_lines,
"region": region,
"matches": matches,
"match_count": len(matches),
"blocks_considered": len(blocks),
},
}
@app.post("/batch")
def batch(req: BatchRequest, screen: int = 0, _: None = Depends(_auth)):
results = []
for index, item in enumerate(req.actions):
try:
item_result = _exec_action(item, screen)
results.append({"index": index, "ok": True, "result": item_result})
except Exception as exc:
results.append({"index": index, "ok": False, "error": str(exc)})
if req.stop_on_error:
break
return {
"ok": all(r["ok"] for r in results),
"request_id": _request_id(),
"time_ms": _now_ms(),
"results": results,
}
return _ok(result)
if __name__ == "__main__":