# clickthrough/server/app.py

import base64
import ctypes
import hmac
import io
import os
import re
import subprocess
import sys
import time
import uuid
from typing import Any, Literal, Optional

from dotenv import load_dotenv
from fastapi import Depends, FastAPI, Header, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from PIL import ImageChops, ImageStat
from pydantic import BaseModel, Field, model_validator


# Load variables from a ".env" file (path is relative to the working directory).
# override=False keeps any value that is already set in the process environment.
load_dotenv(dotenv_path=".env", override=False)

# The FastAPI application object that the exception handlers in this module attach to.
app = FastAPI(title="clickthrough", version="0.1.0")


def _ok(data: Any, status_code: int = 200):
    """Wrap a successful result in the standard response envelope.

    Args:
        data: JSON-serialisable payload placed under the "data" key.
        status_code: HTTP status to send (defaults to 200).

    Returns:
        A JSONResponse with "ok" set to True, a freshly generated request id,
        the current time in milliseconds and a null "error".
    """
    envelope = {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "data": data,
        "error": None,
    }
    return JSONResponse(status_code=status_code, content=envelope)


def _err(code: str, message: str, status_code: int, details: Any = None):
    """Wrap a failure in the standard response envelope.

    Args:
        code: Short machine-readable error code.
        message: Human-readable description of the failure.
        status_code: HTTP status to send.
        details: Optional extra JSON-serialisable context.

    Returns:
        A JSONResponse with "ok" set to False, a null "data" and an "error"
        object holding code, message and details.
    """
    failure = {"code": code, "message": message, "details": details}
    envelope = {
        "ok": False,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "data": None,
        "error": failure,
    }
    return JSONResponse(status_code=status_code, content=envelope)


@app.exception_handler(HTTPException)
async def _http_exception_handler(_: Request, exc: HTTPException):
    """Render an HTTPException as the standard error envelope.

    A dict detail contributes its "message" entry (or "request failed" when it
    has none) as the message and is passed through whole as the details; any
    other detail is stringified and used as the message with no details.
    """
    payload = exc.detail
    if not isinstance(payload, dict):
        return _err("http_error", str(payload), exc.status_code)
    summary = str(payload.get("message", "request failed"))
    return _err("http_error", summary, exc.status_code, payload)


@app.exception_handler(Exception)
async def _unhandled_exception_handler(_: Request, exc: Exception):
    """Answer any otherwise unhandled exception with a generic 500 envelope.

    Only the exception's class name is exposed to the client; its message is not.
    """
    exception_info = {"type": type(exc).__name__}
    return _err("internal_error", "internal server error", 500, exception_info)


@app.exception_handler(RequestValidationError)
async def _validation_exception_handler(_: Request, exc: RequestValidationError):
    """Render a request validation failure as a 422 error envelope.

    The validator's error list is returned unchanged under "details".
    """
    problems = exc.errors()
    return _err("validation_error", "request validation failed", 422, problems)


def _env_bool(name: str, default: bool) -> bool:
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


def _parse_allowed_region() -> Optional[tuple[int, int, int, int]]:
    raw = os.getenv("CLICKTHROUGH_ALLOWED_REGION")
    if not raw:
        return None
    parts = [p.strip() for p in raw.split(",")]
    if len(parts) != 4:
        raise ValueError("CLICKTHROUGH_ALLOWED_REGION must be x,y,width,height")
    x, y, w, h = (int(p) for p in parts)
    if w <= 0 or h <= 0:
        raise ValueError("CLICKTHROUGH_ALLOWED_REGION width/height must be > 0")
    return x, y, w, h


# Process-wide configuration, read once from environment variables when this module is
# imported. Integer settings go through int(), so a non-numeric value raises ValueError
# at import time; _parse_allowed_region() can raise ValueError as well.
SETTINGS = {
    "host": os.getenv("CLICKTHROUGH_HOST", "127.0.0.1"),
    "port": int(os.getenv("CLICKTHROUGH_PORT", "8123")),
    # Compared against the request token header in _auth; an empty value disables the check.
    "token": os.getenv("CLICKTHROUGH_TOKEN", "").strip(),
    # Global dry-run switch; _launch_app skips execution when this or the request flag is set.
    "dry_run": _env_bool("CLICKTHROUGH_DRY_RUN", False),
    # Defaults for ScreenRequest.grid_rows / grid_cols.
    "default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")),
    "default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")),
    # Optional (x, y, width, height) rectangle checked by _enforce_allowed_region; None = unrestricted.
    "allowed_region": _parse_allowed_region(),
    # exec_* settings: their consumer is not visible in this part of the file.
    # NOTE(review): exec_enabled defaults to True -- confirm that is the intended default.
    "exec_enabled": _env_bool("CLICKTHROUGH_EXEC_ENABLED", True),
    "exec_default_shell": os.getenv("CLICKTHROUGH_EXEC_DEFAULT_SHELL", "powershell").strip().lower(),
    "exec_default_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_TIMEOUT_S", "30")),
    "exec_max_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_MAX_TIMEOUT_S", "120")),
    "exec_max_output_chars": int(os.getenv("CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS", "20000")),
    "exec_secret": os.getenv("CLICKTHROUGH_EXEC_SECRET", "").strip(),
    # When non-empty, _import_ocr_libs points pytesseract at this executable.
    "tesseract_cmd": os.getenv("CLICKTHROUGH_TESSERACT_CMD", "").strip(),
}


class ScreenRequest(BaseModel):
    """Grid overlay and image encoding options, presumably for a screen capture request.

    The handler that consumes this model is not visible in this part of the file.
    """

    with_grid: bool = True
    # Grid dimensions (1-200); defaults come from SETTINGS at import time.
    grid_rows: int = Field(default=SETTINGS["default_grid_rows"], ge=1, le=200)
    grid_cols: int = Field(default=SETTINGS["default_grid_cols"], ge=1, le=200)
    include_labels: bool = True
    image_format: Literal["png", "jpeg"] = "png"
    # 1-100; _serialize_image only passes a quality value when the format is "jpeg".
    jpeg_quality: int = Field(default=85, ge=1, le=100)


class ZoomRequest(BaseModel):
    """A rectangle given by its centre point and size, plus grid and encoding options.

    Presumably consumed by a zoom/crop handler that is not visible in this part of the file.
    """

    # Centre of the requested area; must be non-negative.
    center_x: int = Field(ge=0)
    center_y: int = Field(ge=0)
    # Size of the requested area; at least 10 in each dimension.
    width: int = Field(default=500, ge=10)
    height: int = Field(default=350, ge=10)
    with_grid: bool = True
    grid_rows: int = Field(default=20, ge=1, le=300)
    grid_cols: int = Field(default=20, ge=1, le=300)
    include_labels: bool = True
    image_format: Literal["png", "jpeg"] = "png"
    jpeg_quality: int = Field(default=90, ge=1, le=100)


class PixelTarget(BaseModel):
    """Pointer target given as pixel coordinates plus an additive offset.

    _resolve_target turns this into the point (x + dx, y + dy).
    """

    mode: Literal["pixel"]
    x: int
    y: int
    # Offsets added to x / y; default to no offset.
    dx: int = 0
    dy: int = 0


class GridTarget(BaseModel):
    """Pointer target given as a cell of a rows x cols grid laid over a region.

    _resolve_target converts it to a pixel: the cell centre shifted by dx/dy,
    where -1 and 1 reach the cell edges (offset is dx * 0.5 cell widths).
    """

    mode: Literal["grid"]
    region_x: int
    region_y: int
    region_width: int = Field(gt=0)
    region_height: int = Field(gt=0)
    rows: int = Field(gt=0)
    cols: int = Field(gt=0)
    # Zero-based cell indices.
    row: int = Field(ge=0)
    col: int = Field(ge=0)
    # Fractional offsets inside the cell, each limited to [-1, 1].
    dx: float = 0.0
    dy: float = 0.0

    @model_validator(mode="after")
    def _validate_indices(self):
        """Reject cell indices outside the grid and offsets outside [-1, 1]."""
        cell_in_grid = self.row < self.rows and self.col < self.cols
        if not cell_in_grid:
            raise ValueError("row/col must be inside rows/cols")
        dx_ok = -1.0 <= self.dx <= 1.0
        if not dx_ok:
            raise ValueError("dx must be in [-1, 1]")
        dy_ok = -1.0 <= self.dy <= 1.0
        if not dy_ok:
            raise ValueError("dy must be in [-1, 1]")
        return self


# A pointer target is either absolute pixels or a grid cell; the two models
# are told apart by their "mode" literal ("pixel" vs "grid").
Target = PixelTarget | GridTarget


class ActionRequest(BaseModel):
    """Description of a single input action (pointer, scroll, typing or hotkey).

    Which fields are used for which action is decided by the executor, which is
    not visible in this part of the file.
    """

    action: Literal[
        "move",
        "click",
        "right_click",
        "double_click",
        "middle_click",
        "scroll",
        "type",
        "hotkey",
    ]
    # Optional pixel or grid target.
    target: Optional[Target] = None
    duration_ms: int = Field(default=0, ge=0, le=20000)
    button: Literal["left", "right", "middle"] = "left"
    clicks: int = Field(default=1, ge=1, le=10)
    scroll_amount: int = 0
    text: str = ""
    keys: list[str] = Field(default_factory=list)
    interval_ms: int = Field(default=20, ge=0, le=5000)
    # Per-request dry-run flag.
    dry_run: bool = False


class BatchRequest(BaseModel):
    """A list of 1 to 100 actions submitted together."""

    actions: list[ActionRequest] = Field(min_length=1, max_length=100)
    # Presumably aborts the remaining actions after the first failure -- handler not visible here.
    stop_on_error: bool = True


class ExecRequest(BaseModel):
    """A shell command to run; the handler is not visible in this part of the file."""

    # Command text, 1 to 10000 characters.
    command: str = Field(min_length=1, max_length=10000)
    # None presumably falls back to SETTINGS["exec_default_shell"] -- TODO confirm in the handler.
    shell: Literal["powershell", "bash", "cmd"] | None = None
    # Seconds, 1 to 600 when given.
    timeout_s: int | None = Field(default=None, ge=1, le=600)
    cwd: str | None = None
    dry_run: bool = False


class OCRRequest(BaseModel):
    """Input selection and filtering options for an OCR request.

    mode="region" requires all four region_* fields; mode="image" requires a
    non-empty image_base64. mode="screen" needs neither.
    """

    mode: Literal["screen", "region", "image"] = "screen"
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    image_base64: str | None = None
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    # Confidence threshold on a 0-1 scale.
    min_confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_mode_inputs(self):
        """Check that the inputs required by the selected mode are present."""
        if self.mode == "region":
            region_fields = (self.region_x, self.region_y, self.region_width, self.region_height)
            if None in region_fields:
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        elif self.mode == "image":
            if not self.image_base64:
                raise ValueError("image_base64 is required for mode=image")
        return self


class WindowQuery(BaseModel):
    """Filters used by _list_windows to select top-level windows.

    All supplied filters must match for a window to be returned.
    """

    # Case-insensitive substring of the window title.
    title_contains: str | None = Field(default=None, max_length=512)
    # Regular expression searched in the title; compiled with re.IGNORECASE.
    title_regex: str | None = Field(default=None, max_length=512)
    # Case-insensitive exact match against the owning process name.
    process_name: str | None = Field(default=None, max_length=260)
    # Exact window handle.
    hwnd: int | None = Field(default=None, ge=1)
    # When True, windows that are not visible are skipped.
    visible_only: bool = True


class WindowActionRequest(WindowQuery):
    """A window query plus the action that _apply_window_action performs on the match."""

    action: Literal["focus", "restore", "minimize", "maximize", "close"]
    # How long _apply_window_action polls for the resulting window state.
    timeout_ms: int = Field(default=3000, ge=0, le=60000)


class LaunchRequest(BaseModel):
    """Parameters for starting a program via _launch_app."""

    # executable followed by args forms the argv that _launch_app builds.
    executable: str = Field(min_length=1, max_length=2048)
    args: list[str] = Field(default_factory=list, max_length=100)
    # Must be an existing directory when given; _launch_app rejects it otherwise.
    cwd: str | None = None
    # wait_for_window / match / timeout_ms: presumably used to wait for the launched
    # program's window -- that part of _launch_app is not visible here.
    wait_for_window: bool = False
    match: WindowQuery | None = None
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    dry_run: bool = False


class WaitTextCondition(BaseModel):
    """Wait condition about text, either on the whole screen or inside a region.

    mode="region" requires all four region_* fields.
    """

    kind: Literal["text"]
    mode: Literal["screen", "region"] = "screen"
    text: str = Field(min_length=1, max_length=512)
    match: Literal["contains", "exact", "regex"] = "contains"
    # Presumably True waits for the text to appear and False for it to vanish -- evaluator not visible here.
    present: bool = True
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_region(self):
        """Require a complete region when mode is "region"."""
        if self.mode != "region":
            return self
        region_fields = (self.region_x, self.region_y, self.region_width, self.region_height)
        if None in region_fields:
            raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        return self


class WaitWindowCondition(WindowQuery):
    """Wait condition about a window selected by the inherited WindowQuery filters."""

    kind: Literal["window"]
    # Window state to wait for.
    state: Literal["exists", "focused", "closed"] = "exists"


class WaitVisualCondition(BaseModel):
    """Wait condition about visual change or stability, optionally limited to a region.

    The evaluator that interprets these fields is not visible in this part of the file.
    """

    kind: Literal["visual"]
    state: Literal["change", "stable"] = "change"
    # Optional region; no validator in this model requires all four to be set together.
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Difference threshold on a 0-1 scale.
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)
    stable_for_ms: int = Field(default=800, ge=0, le=60000)


class WaitRequest(BaseModel):
    """A text, window or visual condition to wait for, with timeout and polling interval."""

    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    # Polling interval; at least 50 ms.
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)


class OCRFindRequest(OCRRequest):
    """An OCR request extended with a text query and result limits.

    Inherits OCRRequest's fields and its mode validation.
    """

    query: str = Field(min_length=1, max_length=512)
    match: Literal["contains", "exact", "regex"] = "contains"
    group_lines: bool = True
    # Upper bound on returned matches (1-200).
    max_results: int = Field(default=20, ge=1, le=200)


class VisionDiffRequest(BaseModel):
    """Parameters for comparing two images or two captures.

    mode="region" requires all four region_* fields; mode="image" requires both
    before_image_base64 and after_image_base64.
    """

    mode: Literal["screen", "region", "image"] = "screen"
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    before_image_base64: str | None = None
    after_image_base64: str | None = None
    # Presumably the pause between the two captures -- handler not visible here.
    delay_ms: int = Field(default=300, ge=0, le=60000)
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_inputs(self):
        """Check that the inputs required by the selected mode are present."""
        if self.mode == "region":
            region_fields = (self.region_x, self.region_y, self.region_width, self.region_height)
            if None in region_fields:
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        elif self.mode == "image":
            both_images = bool(self.before_image_base64) and bool(self.after_image_base64)
            if not both_images:
                raise ValueError("before_image_base64 and after_image_base64 are required for mode=image")
        return self


class VisionStabilityRequest(BaseModel):
    """Sampling parameters for a visual stability check, optionally limited to a region.

    The handler that interprets these fields is not visible in this part of the file.
    """

    # Optional region; no validator in this model requires all four to be set together.
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    sample_interval_ms: int = Field(default=250, ge=50, le=10000)
    duration_ms: int = Field(default=1200, ge=0, le=120000)
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)


class VerifyActionRequest(BaseModel):
    """An action paired with a condition to check, plus retry and timing limits.

    The handler that runs the action and evaluates the condition is not visible
    in this part of the file.
    """

    action: ActionRequest
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    retries: int = Field(default=0, ge=0, le=10)
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)
    retry_delay_ms: int = Field(default=200, ge=0, le=60000)
    stop_on_action_error: bool = True


class ObserveRequestV2(BaseModel):
    """Capture, encoding and OCR options for a v2 observation request.

    mode="region" requires all four region_* fields. The handler that consumes
    this model is not visible in this part of the file.
    """

    mode: Literal["screen", "region"] = "screen"
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    include_image: bool = True
    image_format: Literal["png", "jpeg"] = "jpeg"
    jpeg_quality: int = Field(default=75, ge=1, le=100)
    ocr_mode: Literal["none", "region", "screen"] = "none"
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.4, ge=0.0, le=1.0)
    # At least 1000 when given; None is also accepted.
    max_ocr_area_px: int | None = Field(default=1_500_000, ge=1000)
    group_lines: bool = True

    @model_validator(mode="after")
    def _validate_region(self):
        """Require a complete region when mode is "region"."""
        if self.mode != "region":
            return self
        region_fields = (self.region_x, self.region_y, self.region_width, self.region_height)
        if None in region_fields:
            raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        return self


class ImageToolPoint(BaseModel):
    """A point with non-negative integer coordinates."""

    x: int = Field(ge=0)
    y: int = Field(ge=0)


class LocalizeRequestV2(BaseModel):
    """Selects something within a stored observation, by text or by point.

    Exactly one selector must be supplied: a non-blank text_query or an
    image_tool_point.
    """

    observation_id: str = Field(min_length=1, max_length=128)
    text_query: str | None = Field(default=None, max_length=512)
    text_match: Literal["contains", "exact", "regex"] = "contains"
    image_tool_point: ImageToolPoint | None = None
    candidate_index: int = Field(default=0, ge=0)

    @model_validator(mode="after")
    def _validate_selector(self):
        """Reject requests that carry neither selector or both of them."""
        query_text = self.text_query or ""
        selector_count = int(bool(query_text.strip())) + int(self.image_tool_point is not None)
        if selector_count != 1:
            raise ValueError("provide exactly one of text_query or image_tool_point")
        return self


class ActionTargetV2(BaseModel):
    """A v2 action target: a stored resolved-target id or explicit pixel coordinates.

    Exactly one form must be used, and the pixel form needs both coordinates.
    """

    resolved_target_id: str | None = Field(default=None, max_length=128)
    pixel_x: int | None = None
    pixel_y: int | None = None

    @model_validator(mode="after")
    def _validate_shape(self):
        """Enforce exactly one target form and a complete pixel pair."""
        uses_id = bool(self.resolved_target_id)
        x_missing = self.pixel_x is None
        y_missing = self.pixel_y is None
        uses_pixel = not (x_missing and y_missing)
        if uses_id == uses_pixel:
            raise ValueError("provide either resolved_target_id or pixel_x/pixel_y")
        if uses_pixel and (x_missing or y_missing):
            raise ValueError("pixel_x and pixel_y are both required")
        return self


class ActionRequestV2(BaseModel):
    """The v2 form of an action request.

    Carries the same fields as ActionRequest except that the target is an
    ActionTargetV2; _resolve_v2_action converts it into an ActionRequest.
    """

    action: Literal[
        "move",
        "click",
        "right_click",
        "double_click",
        "middle_click",
        "scroll",
        "type",
        "hotkey",
    ]
    # Optional; a resolved-target id or explicit pixel coordinates.
    target: ActionTargetV2 | None = None
    duration_ms: int = Field(default=0, ge=0, le=20000)
    button: Literal["left", "right", "middle"] = "left"
    clicks: int = Field(default=1, ge=1, le=10)
    scroll_amount: int = 0
    text: str = ""
    keys: list[str] = Field(default_factory=list)
    interval_ms: int = Field(default=20, ge=0, le=5000)
    dry_run: bool = False


class ActRequestV2(BaseModel):
    """Request wrapper holding a single v2 action."""

    action: ActionRequestV2


class ActVerifyRequestV2(BaseModel):
    """A v2 action paired with a condition to check, plus optional retry/timing overrides."""

    action: ActionRequestV2
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    risk_level: Literal["low", "high"] = "low"
    # The four fields below default to None; presumably a None value is filled in from
    # _risk_defaults(risk_level) -- TODO confirm in the handler, which is not visible here.
    retries: int | None = Field(default=None, ge=0, le=10)
    timeout_ms: int | None = Field(default=None, ge=0, le=120000)
    poll_interval_ms: int | None = Field(default=None, ge=50, le=10000)
    retry_delay_ms: int | None = Field(default=None, ge=0, le=60000)
    stop_on_action_error: bool = True


# In-memory, module-level stores keyed by id. _get_observation reads OBSERVATIONS and
# _resolve_v2_action reads "x"/"y" from RESOLVED_TARGETS entries; the code that fills
# them is not visible in this part of the file.
OBSERVATIONS: dict[str, dict[str, Any]] = {}
RESOLVED_TARGETS: dict[str, dict[str, Any]] = {}


def _get_observation(observation_id: str) -> dict[str, Any]:
    """Return the stored observation for ``observation_id``.

    Raises:
        HTTPException: 404 when no observation is stored under that id.
    """
    found = OBSERVATIONS.get(observation_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail="observation_id not found")


def _resolve_v2_action(req: ActionRequestV2) -> ActionRequest:
    """Convert a v2 action request into an ActionRequest.

    A resolved_target_id is looked up in RESOLVED_TARGETS and its "x"/"y" become
    a pixel target; explicit pixel coordinates are used directly; a request
    without a target stays target-less. All other fields are copied over.

    Raises:
        HTTPException: 404 when the resolved_target_id is unknown.
    """
    spec = req.target
    resolved: Target | None = None
    if spec is not None and spec.resolved_target_id:
        stored = RESOLVED_TARGETS.get(spec.resolved_target_id)
        if stored is None:
            raise HTTPException(status_code=404, detail="resolved_target_id not found")
        resolved = PixelTarget(mode="pixel", x=stored["x"], y=stored["y"], dx=0, dy=0)
    elif spec is not None:
        # ActionTargetV2's validator guarantees both coordinates here; "or 0" is a fallback.
        resolved = PixelTarget(mode="pixel", x=spec.pixel_x or 0, y=spec.pixel_y or 0, dx=0, dy=0)

    copied_fields = {
        "action": req.action,
        "target": resolved,
        "duration_ms": req.duration_ms,
        "button": req.button,
        "clicks": req.clicks,
        "scroll_amount": req.scroll_amount,
        "text": req.text,
        "keys": req.keys,
        "interval_ms": req.interval_ms,
        "dry_run": req.dry_run,
    }
    return ActionRequest(**copied_fields)


def _risk_defaults(risk_level: str) -> dict[str, int]:
    if risk_level == "high":
        return {"retries": 1, "timeout_ms": 6000, "poll_interval_ms": 250, "retry_delay_ms": 300}
    return {"retries": 0, "timeout_ms": 2500, "poll_interval_ms": 200, "retry_delay_ms": 150}


def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
    """FastAPI dependency that checks the request token header.

    Authentication is skipped when SETTINGS["token"] is empty. Otherwise the
    header value must equal the configured token.

    Args:
        x_clickthrough_token: Value of the token header, or None when absent.

    Raises:
        HTTPException: 401 when a token is configured and the header is
            missing or different.
    """
    expected = SETTINGS["token"]
    if not expected:
        return
    supplied = x_clickthrough_token or ""
    # compare_digest avoids leaking, through response timing, how much of the
    # token matched. Bytes are compared because the str form of compare_digest
    # only accepts ASCII.
    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="invalid token")


def _now_ms() -> int:
    return int(time.time() * 1000)


def _request_id() -> str:
    return str(uuid.uuid4())


def _import_capture_libs():
    """Import the screen-capture dependencies on demand.

    Returns:
        The tuple ``(PIL.Image, PIL.ImageDraw, mss)``.

    Raises:
        HTTPException: 500 when either import fails for any reason.
    """
    try:
        from PIL import Image, ImageDraw
        import mss
    except Exception as err:
        raise HTTPException(status_code=500, detail=f"capture backend unavailable: {err}") from err
    return Image, ImageDraw, mss


def _display_region(mon: dict, screen: int, mss_index: int, primary: bool) -> dict:
    return {
        "screen": screen,
        "mss_index": mss_index,
        "primary": primary,
        "x": mon["left"],
        "y": mon["top"],
        "width": mon["width"],
        "height": mon["height"],
    }


def _ordered_displays(sct) -> list[dict]:
    """List the monitors of a capture session with the primary one first.

    The first entry of ``sct.monitors`` is skipped. The first remaining monitor
    whose origin is (0, 0) is treated as primary (falling back to the first
    monitor), moved to the front, and the others keep their relative order.

    Raises:
        HTTPException: 500 when no monitor remains after skipping the first entry.
    """
    physical = list(enumerate(sct.monitors[1:], start=1))
    if len(physical) == 0:
        raise HTTPException(status_code=500, detail="no displays detected")

    primary_slot = 0
    for slot, (_, mon) in enumerate(physical):
        if mon["left"] == 0 and mon["top"] == 0:
            primary_slot = slot
            break

    reordered = [physical[primary_slot]]
    reordered.extend(entry for slot, entry in enumerate(physical) if slot != primary_slot)

    displays = []
    for screen_no, (mss_index, mon) in enumerate(reordered):
        displays.append(
            _display_region(mon, screen=screen_no, mss_index=mss_index, primary=(screen_no == 0))
        )
    return displays


def _get_displays() -> list[dict]:
    """Open a short-lived capture session and return its ordered display list."""
    mss = _import_capture_libs()[2]
    with mss.mss() as session:
        displays = _ordered_displays(session)
    return displays


def _select_display(screen: int) -> tuple[dict, list[dict], dict]:
    """Pick a display by index, falling back to the first one when out of range.

    Returns:
        ``(selected, displays, selection)`` where ``selection`` records the
        requested index, the index actually used and whether a fallback happened.
    """
    displays = _get_displays()
    in_range = 0 <= screen < len(displays)
    chosen = displays[screen] if in_range else displays[0]
    used_index = chosen["screen"]
    selection = {
        "requested": screen,
        "selected": used_index,
        "fallback": used_index != screen,
    }
    return chosen, displays, selection


def _capture_screen(screen: int = 0):
    """Grab a screenshot of one display.

    An out-of-range ``screen`` index falls back to the first display.

    Returns:
        ``(image, display, displays, selection)``: the RGB image, the record of
        the captured display, all display records, and a dict recording the
        requested index, the index used and whether a fallback happened.
    """
    Image, _, mss = _import_capture_libs()
    with mss.mss() as session:
        displays = _ordered_displays(session)
        in_range = 0 <= screen < len(displays)
        chosen = displays[screen] if in_range else displays[0]
        bbox = {
            "left": chosen["x"],
            "top": chosen["y"],
            "width": chosen["width"],
            "height": chosen["height"],
        }
        raw = session.grab(bbox)
        picture = Image.frombytes("RGB", raw.size, raw.rgb)

    selection = {
        "requested": screen,
        "selected": chosen["screen"],
        "fallback": chosen["screen"] != screen,
    }
    return picture, chosen, displays, selection


def _serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
    buf = io.BytesIO()
    if image_format == "jpeg":
        image.save(buf, format="JPEG", quality=jpeg_quality)
    else:
        image.save(buf, format="PNG")
    return buf.getvalue()


def _encode_image(image, image_format: str, jpeg_quality: int) -> str:
    """Encode an image and return the bytes as a base64 ASCII string."""
    encoded_bytes = _serialize_image(image, image_format, jpeg_quality)
    return base64.b64encode(encoded_bytes).decode("ascii")


def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool):
    """Draw a red grid (and optional yellow "row,col" labels) on a copy of an image.

    Args:
        image: Source image; it is copied, not modified.
        region_x: X origin reported in the metadata for this image.
        region_y: Y origin reported in the metadata for this image.
        rows: Number of grid rows.
        cols: Number of grid columns.
        include_labels: Whether to write a label near the centre of each cell.

    Returns:
        ``(annotated_image, meta)`` where ``meta`` describes the region, the
        grid geometry and the formula mapping a cell plus offset to a pixel.
    """
    ImageDraw = _import_capture_libs()[1]
    annotated = image.copy()
    pen = ImageDraw.Draw(annotated)
    width, height = annotated.size

    cell_w = width / cols
    cell_h = height / rows
    red = (255, 0, 0)

    # Interior separators first, then the outer border.
    for col_edge in range(1, cols):
        line_x = int(round(col_edge * cell_w))
        pen.line([(line_x, 0), (line_x, height)], fill=red, width=1)
    for row_edge in range(1, rows):
        line_y = int(round(row_edge * cell_h))
        pen.line([(0, line_y), (width, line_y)], fill=red, width=1)

    pen.rectangle([(0, 0), (width - 1, height - 1)], outline=red, width=2)

    if include_labels:
        for r in range(rows):
            centre_y = int((r + 0.5) * cell_h)
            for c in range(cols):
                centre_x = int((c + 0.5) * cell_w)
                # Shift up and left so the label sits roughly on the cell centre.
                pen.text((centre_x - 12, centre_y - 6), f"{r},{c}", fill=(255, 255, 0))

    point_formula = {
        "pixel_x": "region.x + ((col + 0.5 + dx*0.5) * cell_width)",
        "pixel_y": "region.y + ((row + 0.5 + dy*0.5) * cell_height)",
        "dx_range": "[-1,1]",
        "dy_range": "[-1,1]",
    }
    grid_info = {
        "rows": rows,
        "cols": cols,
        "cell_width": cell_w,
        "cell_height": cell_h,
        "indexing": "zero-based",
        "point_formula": point_formula,
    }
    meta = {
        "region": {"x": region_x, "y": region_y, "width": width, "height": height},
        "grid": grid_info,
    }
    return annotated, meta


def _resolve_target(target: Target) -> tuple[int, int, dict]:
    """Convert a pixel or grid target into concrete coordinates.

    A pixel target resolves to (x + dx, y + dy). A grid target resolves to the
    centre of its cell shifted by half a cell per unit of dx/dy, rounded and
    added to the region origin.

    Returns:
        ``(x, y, info)`` where ``info`` records the mode, the source target
        and, for grid targets, the derived cell size.
    """
    if not isinstance(target, PixelTarget):
        cell_w = target.region_width / target.cols
        cell_h = target.region_height / target.rows

        col_position = target.col + 0.5 + (target.dx * 0.5)
        row_position = target.row + 0.5 + (target.dy * 0.5)
        grid_x = target.region_x + int(round(col_position * cell_w))
        grid_y = target.region_y + int(round(row_position * cell_h))

        grid_info = {
            "mode": "grid",
            "source": target.model_dump(),
            "derived": {"cell_width": cell_w, "cell_height": cell_h},
        }
        return grid_x, grid_y, grid_info

    pixel_x = target.x + target.dx
    pixel_y = target.y + target.dy
    return pixel_x, pixel_y, {"mode": "pixel", "source": target.model_dump()}


def _enforce_allowed_region(x: int, y: int):
    """Reject points outside the configured allowed region.

    Does nothing when SETTINGS["allowed_region"] is None. The region includes
    its left/top edges and excludes its right/bottom edges.

    Raises:
        HTTPException: 403 when the point lies outside the region.
    """
    bounds = SETTINGS["allowed_region"]
    if bounds is None:
        return
    left, top, width, height = bounds
    inside_x = left <= x < left + width
    inside_y = top <= y < top + height
    if inside_x and inside_y:
        return
    raise HTTPException(status_code=403, detail="point outside allowed region")


def _import_input_lib():
    """Import pyautogui on demand with its fail-safe switched on.

    Returns:
        The pyautogui module.

    Raises:
        HTTPException: 500 when the import or setup fails for any reason.
    """
    try:
        import pyautogui

        pyautogui.FAILSAFE = True
    except Exception as err:
        raise HTTPException(status_code=500, detail=f"input backend unavailable: {err}") from err
    return pyautogui


def _import_ocr_libs():
    """Import pytesseract on demand and apply the configured executable path.

    When SETTINGS["tesseract_cmd"] is non-empty it is assigned to
    ``pytesseract.pytesseract.tesseract_cmd``.

    Returns:
        The tuple ``(pytesseract, pytesseract.Output)``.

    Raises:
        HTTPException: 500 when the import or setup fails for any reason.
    """
    try:
        import pytesseract
        from pytesseract import Output

        configured_path = SETTINGS["tesseract_cmd"]
        if configured_path:
            pytesseract.pytesseract.tesseract_cmd = configured_path
    except Exception as err:
        raise HTTPException(status_code=500, detail=f"ocr backend unavailable: {err}") from err
    return pytesseract, Output


def _decode_image_base64(value: str):
    """Decode a base64 string (optionally a data URL) into an RGB image.

    Args:
        value: Base64 text, or a "data:" URL whose part after the first comma
            is base64 text. Surrounding whitespace is ignored.

    Returns:
        The decoded image converted to RGB.

    Raises:
        HTTPException: 500 when PIL cannot be imported; 400 when the data URL
            has no comma, the base64 is invalid, or the bytes are not a
            readable image.
    """
    try:
        from PIL import Image
    except Exception as err:
        raise HTTPException(status_code=500, detail=f"image decode backend unavailable: {err}") from err

    encoded = value.strip()
    if encoded.startswith("data:"):
        _, comma, body = encoded.partition(",")
        if not comma:
            raise HTTPException(status_code=400, detail="invalid data URL image payload")
        encoded = body

    try:
        raw_bytes = base64.b64decode(encoded, validate=True)
    except Exception as err:
        raise HTTPException(status_code=400, detail="invalid image_base64 payload") from err

    try:
        return Image.open(io.BytesIO(raw_bytes)).convert("RGB")
    except Exception as err:
        raise HTTPException(status_code=400, detail="unsupported or unreadable image bytes") from err


def _run_ocr(image, language_hint: str | None, min_confidence: float, offset_x: int = 0, offset_y: int = 0) -> list[dict]:
    """Run Tesseract on an image and return the recognised text blocks.

    Args:
        image: Image handed to ``pytesseract.image_to_data``.
        language_hint: Optional Tesseract language passed as ``lang``.
        min_confidence: Minimum confidence on a 0-1 scale; lower entries are dropped.
        offset_x: Added to every bounding-box x coordinate.
        offset_y: Added to every bounding-box y coordinate.

    Returns:
        Dicts with "text", "confidence" (0-1, four decimals) and "bbox",
        ordered by y, then x, then Tesseract's own index. Entries with empty
        text or an unparsable/negative confidence are skipped.

    Raises:
        HTTPException: 500 when the Tesseract executable is missing; 400 when
            Tesseract reports an error.
    """
    pytesseract, Output = _import_ocr_libs()

    ocr_kwargs = {
        "image": image,
        "output_type": Output.DICT,
        "config": "--oem 3 --psm 6",
    }
    if language_hint:
        ocr_kwargs["lang"] = language_hint

    try:
        data = pytesseract.image_to_data(**ocr_kwargs)
    except pytesseract.TesseractNotFoundError as exc:
        raise HTTPException(status_code=500, detail="tesseract executable not found") from exc
    except pytesseract.TesseractError as exc:
        raise HTTPException(status_code=400, detail=f"ocr failed: {exc}") from exc

    keyed_blocks = []
    for idx in range(len(data.get("text", []))):
        word = (data["text"][idx] or "").strip()
        if not word:
            continue

        try:
            score = float(str(data["conf"][idx]).strip())
        except ValueError:
            # An unparsable confidence is treated like a negative one: skipped.
            continue
        if score < 0:
            continue

        confidence = round(score / 100.0, 4)
        if confidence < min_confidence:
            continue

        left = int(data["left"][idx])
        top = int(data["top"][idx])
        width = int(data["width"][idx])
        height = int(data["height"][idx])
        box_x = left + offset_x
        box_y = top + offset_y

        block = {
            "text": word,
            "confidence": confidence,
            "bbox": {"x": box_x, "y": box_y, "width": width, "height": height},
        }
        # idx makes every key unique, so the block dicts are never compared.
        keyed_blocks.append(((box_y, box_x, idx), block))

    keyed_blocks.sort(key=lambda pair: pair[0])
    return [block for _, block in keyed_blocks]


def _normalize_text(value: str) -> str:
    return re.sub(r"\s+", " ", value).strip()


def _matches_text(haystack: str, needle: str, match_mode: str) -> bool:
    if match_mode == "exact":
        return haystack == needle
    if match_mode == "regex":
        return re.search(needle, haystack) is not None
    return needle.lower() in haystack.lower()


def _windows_only(feature: str):
    """Guard for Windows-specific features.

    Raises:
        HTTPException: 501 naming ``feature`` when the host is not Windows.
    """
    if sys.platform == "win32":
        return
    raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")


def _tasklist_process_name(pid: int) -> str | None:
    try:
        completed = subprocess.run(
            ["tasklist", "/FI", f"PID eq {pid}", "/FO", "CSV", "/NH"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    except Exception:
        return None

    line = (completed.stdout or "").strip().splitlines()
    if not line:
        return None
    row = line[0].strip()
    if not row or row.startswith("INFO:"):
        return None
    if row.startswith('"') and '","' in row:
        return row.split('","', 1)[0].strip('"')
    return None


def _list_windows(query: WindowQuery | None = None) -> list[dict]:
    """Enumerate top-level windows, optionally filtered by a query.

    Args:
        query: Optional filters. ``hwnd`` must match exactly, ``visible_only``
            skips invisible windows, ``title_contains`` is a case-insensitive
            substring, ``title_regex`` is searched case-insensitively and
            ``process_name`` is a case-insensitive exact match.

    Returns:
        One dict per matching window (hwnd, title, class_name, pid,
        process_name, visible, enabled, minimized, maximized, foreground,
        rect), sorted with the foreground window first, then by lower-cased
        title and hwnd.

    Raises:
        HTTPException: 501 on non-Windows hosts; 400 when ``title_regex`` is
            not a valid regular expression.
    """
    _windows_only("window endpoints")

    # "import ctypes" alone does not load the wintypes submodule, so
    # ctypes.wintypes is only an attribute if something else imported it.
    from ctypes import wintypes

    try:
        title_regex = re.compile(query.title_regex, re.IGNORECASE) if query and query.title_regex else None
    except re.error as exc:
        # The pattern is caller input, so a bad one is a client error.
        raise HTTPException(status_code=400, detail=f"invalid title_regex: {exc}") from exc

    user32 = ctypes.windll.user32
    enum_proc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)

    # Declare pointer-sized arguments so handles are not passed as C ints.
    user32.EnumWindows.restype = ctypes.c_bool
    user32.EnumWindows.argtypes = [enum_proc, ctypes.c_void_p]
    user32.IsWindowVisible.argtypes = [ctypes.c_void_p]
    user32.IsWindowVisible.restype = ctypes.c_bool
    user32.IsWindowEnabled.argtypes = [ctypes.c_void_p]
    user32.IsWindowEnabled.restype = ctypes.c_bool
    user32.IsIconic.argtypes = [ctypes.c_void_p]
    user32.IsIconic.restype = ctypes.c_bool
    user32.IsZoomed.argtypes = [ctypes.c_void_p]
    user32.IsZoomed.restype = ctypes.c_bool
    user32.GetWindowTextLengthW.argtypes = [ctypes.c_void_p]
    user32.GetWindowTextLengthW.restype = ctypes.c_int
    user32.GetWindowTextW.argtypes = [ctypes.c_void_p, ctypes.c_wchar_p, ctypes.c_int]
    user32.GetWindowTextW.restype = ctypes.c_int
    user32.GetClassNameW.argtypes = [ctypes.c_void_p, ctypes.c_wchar_p, ctypes.c_int]
    user32.GetClassNameW.restype = ctypes.c_int
    user32.GetForegroundWindow.restype = ctypes.c_void_p
    user32.GetWindowRect.argtypes = [ctypes.c_void_p, ctypes.POINTER(wintypes.RECT)]
    user32.GetWindowThreadProcessId.argtypes = [ctypes.c_void_p, ctypes.POINTER(wintypes.DWORD)]
    user32.GetWindowThreadProcessId.restype = wintypes.DWORD

    foreground = int(user32.GetForegroundWindow() or 0)
    windows: list[dict] = []
    # One tasklist subprocess per distinct pid instead of one per window.
    process_names: dict[int, str | None] = {}

    def _callback(hwnd, _lparam):
        # Returning True tells EnumWindows to continue with the next window.
        hwnd_int = int(hwnd)
        if query and query.hwnd is not None and hwnd_int != query.hwnd:
            return True

        visible = bool(user32.IsWindowVisible(hwnd))
        if query and query.visible_only and not visible:
            return True

        title_len = user32.GetWindowTextLengthW(hwnd)
        title_buf = ctypes.create_unicode_buffer(max(title_len + 1, 1))
        user32.GetWindowTextW(hwnd, title_buf, len(title_buf))
        title = title_buf.value

        # Cheap title filters run before the process lookup, which spawns a subprocess.
        if query:
            if query.title_contains and query.title_contains.lower() not in title.lower():
                return True
            if title_regex and not title_regex.search(title):
                return True

        pid = wintypes.DWORD()
        user32.GetWindowThreadProcessId(hwnd, ctypes.byref(pid))
        pid_value = int(pid.value)
        if pid_value not in process_names:
            process_names[pid_value] = _tasklist_process_name(pid_value)
        process_name = process_names[pid_value]
        if query and query.process_name and (process_name or "").lower() != query.process_name.lower():
            return True

        class_buf = ctypes.create_unicode_buffer(256)
        user32.GetClassNameW(hwnd, class_buf, len(class_buf))

        rect = wintypes.RECT()
        user32.GetWindowRect(hwnd, ctypes.byref(rect))

        windows.append(
            {
                "hwnd": hwnd_int,
                "title": title,
                "class_name": class_buf.value,
                "pid": pid_value,
                "process_name": process_name,
                "visible": visible,
                "enabled": bool(user32.IsWindowEnabled(hwnd)),
                "minimized": bool(user32.IsIconic(hwnd)),
                "maximized": bool(user32.IsZoomed(hwnd)),
                "foreground": hwnd_int == foreground,
                "rect": {
                    "x": int(rect.left),
                    "y": int(rect.top),
                    "width": int(rect.right - rect.left),
                    "height": int(rect.bottom - rect.top),
                },
            }
        )
        return True

    user32.EnumWindows(enum_proc(_callback), 0)
    windows.sort(key=lambda item: (not item["foreground"], item["title"].lower(), item["hwnd"]))
    return windows


def _require_window_match(query: WindowQuery) -> dict:
    """Return the single window matching ``query``.

    Raises:
        HTTPException: 404 when nothing matches; 409 (with up to ten of the
            candidates) when several windows match and no hwnd was given.
    """
    candidates = _list_windows(query)
    if len(candidates) == 0:
        raise HTTPException(status_code=404, detail="no matching window found")
    ambiguous = query.hwnd is None and len(candidates) > 1
    if ambiguous:
        conflict = {"message": "multiple windows matched", "matches": candidates[:10]}
        raise HTTPException(status_code=409, detail=conflict)
    return candidates[0]


def _apply_window_action(req: WindowActionRequest) -> dict:
    """Apply a window action to the single matched window and wait for it to settle.

    Supported actions: ``focus`` / ``restore`` (``ShowWindow(SW_RESTORE)``
    followed by ``SetForegroundWindow``), ``minimize``, ``maximize`` and
    ``close`` (posts ``WM_CLOSE``). After issuing the call the window is
    re-queried by handle every 100 ms until it reaches the expected state or
    ``req.timeout_ms`` has elapsed.

    Returns:
        A dict with ``ok`` (truthiness of the Win32 call's return value),
        ``matched`` (the window as found before the action), ``window`` (the
        most recent poll result, ``None`` when the handle is no longer
        listed) and ``closed`` (``True`` when the last poll found no window).

    Raises:
        HTTPException: 400 for an unsupported action; 404/409 propagated from
            ``_require_window_match``.
    """
    _windows_only("window endpoints")
    match = _require_window_match(req)
    hwnd = match["hwnd"]
    user32 = ctypes.windll.user32
    WM_CLOSE = 0x0010
    SW_RESTORE = 9
    SW_MINIMIZE = 6
    SW_MAXIMIZE = 3

    # NOTE(review): ShowWindow's return value reports whether the window was
    # previously visible rather than success, so ``ok`` for minimize/maximize
    # is only a rough signal — confirm callers rely on ``window`` instead.
    if req.action in {"focus", "restore"}:
        user32.ShowWindow(hwnd, SW_RESTORE)
        ok = bool(user32.SetForegroundWindow(hwnd))
    elif req.action == "minimize":
        ok = bool(user32.ShowWindow(hwnd, SW_MINIMIZE))
    elif req.action == "maximize":
        ok = bool(user32.ShowWindow(hwnd, SW_MAXIMIZE))
    elif req.action == "close":
        ok = bool(user32.PostMessageW(hwnd, WM_CLOSE, 0, 0))
    else:
        raise HTTPException(status_code=400, detail="unsupported window action")

    # Use the monotonic clock so wall-clock adjustments cannot shorten or
    # extend the wait, and always poll at least once: otherwise a tiny
    # timeout would leave ``final_match`` as None and report the window as
    # closed without ever having looked at it.
    deadline = time.monotonic() + (req.timeout_ms / 1000.0)
    final_match = None
    while True:
        current = _list_windows(WindowQuery(hwnd=hwnd, visible_only=False))
        final_match = current[0] if current else None

        if final_match is None:
            # Only a close request is satisfied by the window disappearing.
            settled = req.action == "close"
        elif req.action in {"focus", "restore"}:
            settled = final_match["foreground"] and not final_match["minimized"]
        elif req.action == "minimize":
            settled = final_match["minimized"]
        elif req.action == "maximize":
            settled = final_match["maximized"]
        else:
            settled = False

        if settled or time.monotonic() >= deadline:
            break
        time.sleep(0.1)

    return {
        "ok": ok,
        "matched": match,
        "window": final_match,
        "closed": final_match is None,
    }


def _launch_app(req: LaunchRequest) -> dict:
    """Start an executable and optionally wait for a matching window.

    The working directory, when given, is made absolute and must exist. In
    dry-run mode (global setting or per-request flag) nothing is started and
    the would-be ``argv``/``cwd`` are returned instead.

    When ``req.wait_for_window`` is set, windows are polled every 200 ms
    using ``req.match`` or, by default, a visible-only query on the
    executable's base name, until one is found or ``req.timeout_ms`` elapses.

    Returns:
        A dict with ``executed``, ``dry_run``, ``argv``, ``cwd`` and, when a
        process was started, ``pid``; plus ``window`` / ``window_found`` when
        waiting for a window was requested.

    Raises:
        HTTPException: 400 when ``cwd`` is not a directory, the executable is
            not found, or the OS refuses to start the process.
    """
    cwd = None
    if req.cwd:
        cwd = os.path.abspath(req.cwd)
        if not os.path.isdir(cwd):
            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = [req.executable, *req.args]
    if SETTINGS["dry_run"] or req.dry_run:
        return {"executed": False, "dry_run": True, "argv": argv, "cwd": cwd}

    try:
        # The child is intentionally left running; only its pid is reported.
        proc = subprocess.Popen(argv, cwd=cwd)
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"executable not found: {exc}") from exc
    except OSError as exc:
        raise HTTPException(status_code=400, detail=f"failed to launch process: {exc}") from exc

    result = {
        "executed": True,
        "dry_run": False,
        "argv": argv,
        "cwd": cwd,
        "pid": proc.pid,
    }

    if req.wait_for_window:
        query = req.match or WindowQuery(process_name=os.path.basename(req.executable), visible_only=True)
        # Monotonic deadline (immune to wall-clock changes) and at least one
        # poll, so a very small timeout still checks for the window once.
        deadline = time.monotonic() + (req.timeout_ms / 1000.0)
        match = None
        while True:
            matches = _list_windows(query)
            if matches:
                match = matches[0]
                break
            if time.monotonic() >= deadline:
                break
            time.sleep(0.2)
        result["window"] = match
        result["window_found"] = match is not None

    return result


def _capture_region_image(screen: int, region_x: int | None, region_y: int | None, region_width: int | None, region_height: int | None):
    """Capture a monitor and optionally crop it to a rectangle.

    The region coordinates are offset by the monitor's own ``x``/``y`` before
    cropping. If any of the four region values is ``None`` the whole captured
    image is returned together with the monitor's rectangle.

    Returns:
        ``(image, region, mon, displays, screen_selection)``.

    Raises:
        HTTPException: 400 when the rectangle does not lie fully inside the
            captured image.
    """
    full_frame, mon, displays, screen_selection = _capture_screen(screen)

    requested = (region_x, region_y, region_width, region_height)
    if any(part is None for part in requested):
        whole = {key: mon[key] for key in ("x", "y", "width", "height")}
        return full_frame, whole, mon, displays, screen_selection

    # Translate the requested rectangle into image-local pixel coordinates.
    box_left = region_x - mon["x"]
    box_top = region_y - mon["y"]
    box_right = box_left + region_width
    box_bottom = box_top + region_height

    fits = (
        box_left >= 0
        and box_top >= 0
        and box_right <= full_frame.size[0]
        and box_bottom <= full_frame.size[1]
    )
    if not fits:
        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

    cropped = full_frame.crop((box_left, box_top, box_right, box_bottom))
    rectangle = {"x": region_x, "y": region_y, "width": region_width, "height": region_height}
    return cropped, rectangle, mon, displays, screen_selection


def _capture_ocr_source(req: OCRRequest, screen: int = 0):
    """Produce the image OCR should run on, according to ``req.mode``.

    * ``"image"``: decode ``req.image_base64``; no screen capture happens and
      the monitor-related return slots are ``None``.
    * ``"screen"``: the full captured monitor.
    * any other mode: the captured monitor cropped to the request's region.

    Returns:
        ``(image, region, mon, displays, screen_selection, source)``.

    Raises:
        HTTPException: 400 when the requested region does not lie fully
            inside the captured image.
    """
    source = req.mode

    if source == "image":
        decoded = _decode_image_base64(req.image_base64 or "")
        width, height = decoded.size[0], decoded.size[1]
        return decoded, {"x": 0, "y": 0, "width": width, "height": height}, None, None, None, source

    full_frame, mon, displays, screen_selection = _capture_screen(screen)

    if source == "screen":
        whole = {key: mon[key] for key in ("x", "y", "width", "height")}
        return full_frame, whole, mon, displays, screen_selection, source

    # Region mode: convert the requested rectangle to image-local coordinates.
    box_left = req.region_x - mon["x"]
    box_top = req.region_y - mon["y"]
    box_right = box_left + req.region_width
    box_bottom = box_top + req.region_height

    fits = (
        box_left >= 0
        and box_top >= 0
        and box_right <= full_frame.size[0]
        and box_bottom <= full_frame.size[1]
    )
    if not fits:
        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

    cropped = full_frame.crop((box_left, box_top, box_right, box_bottom))
    rectangle = {
        "x": req.region_x,
        "y": req.region_y,
        "width": req.region_width,
        "height": req.region_height,
    }
    return cropped, rectangle, mon, displays, screen_selection, source


def _image_diff_ratio(before, after) -> float:
    """Return the mean per-channel pixel difference of two images divided by 255.

    The channel means of ``ImageChops.difference(before, after)`` are
    averaged across channels, so the result is 0.0 for identical images.
    """
    per_channel = ImageStat.Stat(ImageChops.difference(before, after)).mean
    if not isinstance(per_channel, list):
        per_channel = [per_channel]
    return float(sum(per_channel) / (len(per_channel) * 255.0))


def _merge_bbox(blocks: list[dict]) -> dict:
    xs = [b["bbox"]["x"] for b in blocks]
    ys = [b["bbox"]["y"] for b in blocks]
    rights = [b["bbox"]["x"] + b["bbox"]["width"] for b in blocks]
    bottoms = [b["bbox"]["y"] + b["bbox"]["height"] for b in blocks]
    return {
        "x": min(xs),
        "y": min(ys),
        "width": max(rights) - min(xs),
        "height": max(bottoms) - min(ys),
    }


def _group_ocr_lines(blocks: list[dict]) -> list[dict]:
    """Cluster OCR blocks into text lines by vertical proximity.

    Blocks are visited top-to-bottom, then left-to-right. A block starts a
    new line when its vertical centre is further from the running mean
    centre of the current line than ``max(10.0, 0.8 * block height)``.
    Within a line, blocks are ordered by ``x``.

    Returns:
        One dict per non-empty line with ``text`` (block texts joined by a
        space), ``confidence`` (mean, rounded to 4 places), ``bbox`` (merged
        box), ``blocks`` and ``line_index``. Lines whose joined text is empty
        are dropped, but ``line_index`` still counts them.
    """
    if not blocks:
        return []

    def _mid_y(entry: dict) -> float:
        box = entry["bbox"]
        return box["y"] + (box["height"] / 2)

    def _left_edge(entry: dict):
        return entry["bbox"]["x"]

    ordered = sorted(blocks, key=lambda b: (b["bbox"]["y"], b["bbox"]["x"]))

    rows: list[list[dict]] = []
    row: list[dict] = []
    row_center = None
    for entry in ordered:
        limit = max(10.0, entry["bbox"]["height"] * 0.8)
        # Flush the running row once this block sits too far from its centre.
        if row and abs(_mid_y(entry) - row_center) > limit:
            rows.append(sorted(row, key=_left_edge))
            row = []
        row.append(entry)
        row_center = sum(_mid_y(member) for member in row) / len(row)

    if row:
        rows.append(sorted(row, key=_left_edge))

    result = []
    for position, members in enumerate(rows):
        joined = " ".join(member["text"] for member in members).strip()
        if joined:
            mean_confidence = sum(member["confidence"] for member in members) / len(members)
            result.append(
                {
                    "text": joined,
                    "confidence": round(mean_confidence, 4),
                    "bbox": _merge_bbox(members),
                    "blocks": members,
                    "line_index": position,
                }
            )
    return result


def _find_text_matches(blocks: list[dict], query: str, match_mode: str, group_lines: bool, max_results: int) -> list[dict]:
    """Find OCR entries whose normalized text matches *query*.

    When *group_lines* is true the blocks are first grouped into lines and
    each hit also carries the line's ``blocks`` and ``line_index``. Entries
    whose normalized text is empty are ignored.

    Returns:
        Up to *max_results* hits ordered by descending confidence, then by
        ``bbox`` ``y`` and ``x``.
    """
    wanted = _normalize_text(query)
    pool = _group_ocr_lines(blocks) if group_lines else blocks

    found = []
    for entry in pool:
        cleaned = _normalize_text(entry["text"])
        if not cleaned or not _matches_text(cleaned, wanted, match_mode):
            continue
        hit = {
            "text": entry["text"],
            "normalized_text": cleaned,
            "confidence": entry["confidence"],
            "bbox": entry["bbox"],
            "grouped": group_lines,
        }
        if group_lines:
            hit.update(blocks=entry["blocks"], line_index=entry["line_index"])
        found.append(hit)

    ranked = sorted(found, key=lambda hit: (-hit["confidence"], hit["bbox"]["y"], hit["bbox"]["x"]))
    return ranked[:max_results]


def _compute_visual_diff(req: VisionDiffRequest, screen: int = 0) -> dict:
    """Measure how much two images differ.

    In ``"image"`` mode the two base64 images from the request are compared
    and must have identical dimensions. In every other mode a region (or the
    whole monitor) is captured twice, ``req.delay_ms`` apart, and the two
    captures are compared.

    Returns:
        A dict with ``mode``, ``region``, ``diff_ratio``, ``changed``
        (``diff_ratio >= req.diff_threshold``) and ``diff_threshold``;
        capture modes add ``screen``, ``display`` and ``delay_ms``.

    Raises:
        HTTPException: 400 when the supplied images differ in size.
    """

    def _report(area: dict, ratio: float) -> dict:
        # Fields shared by both modes, in the order they are emitted.
        return {
            "mode": req.mode,
            "region": area,
            "diff_ratio": ratio,
            "changed": ratio >= req.diff_threshold,
            "diff_threshold": req.diff_threshold,
        }

    if req.mode == "image":
        first = _decode_image_base64(req.before_image_base64 or "")
        second = _decode_image_base64(req.after_image_base64 or "")
        if first.size != second.size:
            raise HTTPException(status_code=400, detail="before and after images must have matching dimensions")
        ratio = _image_diff_ratio(first, second)
        return _report({"x": 0, "y": 0, "width": first.size[0], "height": first.size[1]}, ratio)

    first, region, mon, displays, screen_selection = _capture_region_image(
        screen, req.region_x, req.region_y, req.region_width, req.region_height
    )
    if req.delay_ms > 0:
        time.sleep(req.delay_ms / 1000.0)
    # Re-capture exactly the rectangle the first capture resolved to.
    second = _capture_region_image(screen, region["x"], region["y"], region["width"], region["height"])[0]

    report = _report(region, _image_diff_ratio(first, second))
    report.update(screen=screen_selection, display=mon, delay_ms=req.delay_ms)
    return report


def _measure_stability(req: VisionStabilityRequest, screen: int = 0) -> dict:
    """Sample a screen region repeatedly and report how much it changes.

    After an initial capture, the region is re-captured every
    ``req.sample_interval_ms`` until ``req.duration_ms`` has passed. Each
    sample is compared with the sample immediately before it.

    Returns:
        A dict with ``stable`` (largest frame-to-frame diff is at or below
        ``req.diff_threshold``), ``region``, ``sample_count``,
        ``max_diff_ratio``, ``avg_diff_ratio`` (rounded to 6 places, 0.0 with
        no samples), the echoed request settings, ``screen`` and ``display``.
    """
    previous, region, mon, displays, screen_selection = _capture_region_image(
        screen, req.region_x, req.region_y, req.region_width, req.region_height
    )

    ratios: list[float] = []
    stop_at = time.time() + (req.duration_ms / 1000.0)
    while time.time() < stop_at:
        time.sleep(req.sample_interval_ms / 1000.0)
        frame = _capture_region_image(screen, region["x"], region["y"], region["width"], region["height"])[0]
        ratios.append(_image_diff_ratio(previous, frame))
        # Slide the reference forward so drift is measured frame-to-frame.
        previous = frame

    peak = max([0.0, *ratios])
    average = round(sum(ratios) / len(ratios), 6) if ratios else 0.0

    return {
        "stable": peak <= req.diff_threshold,
        "region": region,
        "sample_count": len(ratios),
        "max_diff_ratio": peak,
        "avg_diff_ratio": average,
        "diff_threshold": req.diff_threshold,
        "duration_ms": req.duration_ms,
        "sample_interval_ms": req.sample_interval_ms,
        "screen": screen_selection,
        "display": mon,
    }


def _run_verified_action(req: VerifyActionRequest, screen: int = 0) -> dict:
    """Run an action, then wait for a condition, retrying up to ``req.retries`` times.

    Each attempt executes ``req.action`` and then waits for ``req.condition``.
    A failing action is recorded rather than raised; when
    ``req.stop_on_action_error`` is set the run ends at that attempt without
    verifying, otherwise verification still runs. Between unsuccessful
    attempts the function sleeps ``req.retry_delay_ms``.

    Returns:
        A dict with ``success``, ``attempts`` and ``final_attempt``. Every
        attempt record has the same keys: ``attempt``, ``action_ok``,
        ``action_error``, ``action_result`` and ``verification``
        (``verification`` is ``None`` when the run stopped on an action
        error).
    """
    attempts = []
    for attempt in range(req.retries + 1):
        try:
            action_result = _exec_action(req.action, screen)
        except Exception as exc:  # deliberate: any action failure is reported, not raised
            action_ok, action_result, action_error = False, None, str(exc)
        else:
            action_ok, action_error = True, None

        # One record shape for every outcome, so consumers never have to
        # special-case a missing "action_result" on the early-exit path.
        record = {
            "attempt": attempt,
            "action_ok": action_ok,
            "action_error": action_error,
            "action_result": action_result,
            "verification": None,
        }

        if not action_ok and req.stop_on_action_error:
            attempts.append(record)
            return {"success": False, "attempts": attempts, "final_attempt": attempt}

        verification = _wait_for_condition(
            WaitRequest(
                condition=req.condition,
                timeout_ms=req.timeout_ms,
                poll_interval_ms=req.poll_interval_ms,
            ),
            screen,
        )
        record["verification"] = verification
        attempts.append(record)

        if verification.get("satisfied"):
            return {"success": True, "attempts": attempts, "final_attempt": attempt}
        # No delay after the last attempt; there is nothing left to retry.
        if attempt < req.retries and req.retry_delay_ms > 0:
            time.sleep(req.retry_delay_ms / 1000.0)

    return {"success": False, "attempts": attempts, "final_attempt": req.retries}


def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
    """Poll until ``req.condition`` is satisfied or ``req.timeout_ms`` elapses.

    Three condition types are handled:

    * ``WaitVisualCondition``: a region is captured repeatedly and each
      capture is compared with the previous one. State ``"change"`` is
      satisfied when the diff reaches ``diff_threshold``; any other state is
      satisfied once the diff has stayed at or below the threshold for
      ``stable_for_ms``.
    * ``WaitWindowCondition``: windows are listed and tested for
      ``"exists"``, ``"focused"`` or ``"closed"``.
    * ``WaitTextCondition``: the region is OCR'd and searched for the text;
      ``present`` selects whether a match or the absence of one satisfies.

    Returns:
        A dict whose ``satisfied`` flag tells whether the condition was met,
        plus ``kind``, ``polls`` and condition-specific details. A timed-out
        window/text wait carries only ``satisfied``, ``kind`` and ``polls``.

    Raises:
        HTTPException: 400 for a condition of any other type.
    """
    condition = req.condition
    deadline = time.time() + (req.timeout_ms / 1000.0)
    polls = 0

    if isinstance(condition, WaitVisualCondition):
        baseline, region, mon, displays, screen_selection = _capture_region_image(
            screen,
            condition.region_x,
            condition.region_y,
            condition.region_width,
            condition.region_height,
        )
        stable_since = None
        last_diff = 0.0
        while True:
            # The deadline is checked before sleeping, so the visual loop can
            # time out without having taken a single comparison sample.
            if time.time() > deadline:
                return {
                    "satisfied": False,
                    "kind": condition.kind,
                    "state": condition.state,
                    "polls": polls,
                    "region": region,
                    "diff_ratio": last_diff,
                    "screen": screen_selection,
                    "display": mon,
                }
            time.sleep(req.poll_interval_ms / 1000.0)
            # Re-capture the exact rectangle the baseline capture resolved to.
            current, _, _, _, _ = _capture_region_image(
                screen,
                region["x"],
                region["y"],
                region["width"],
                region["height"],
            )
            polls += 1
            last_diff = _image_diff_ratio(baseline, current)
            if condition.state == "change":
                if last_diff >= condition.diff_threshold:
                    return {
                        "satisfied": True,
                        "kind": condition.kind,
                        "state": condition.state,
                        "polls": polls,
                        "region": region,
                        "diff_ratio": last_diff,
                        "screen": screen_selection,
                        "display": mon,
                    }
            else:
                if last_diff <= condition.diff_threshold:
                    # Start the stability timer on the first quiet poll and
                    # keep it running while polls stay quiet.
                    stable_since = stable_since or time.time()
                    if (time.time() - stable_since) * 1000 >= condition.stable_for_ms:
                        return {
                            "satisfied": True,
                            "kind": condition.kind,
                            "state": condition.state,
                            "polls": polls,
                            "region": region,
                            "diff_ratio": last_diff,
                            "stable_for_ms": int((time.time() - stable_since) * 1000),
                            "screen": screen_selection,
                            "display": mon,
                        }
                else:
                    # Any movement above the threshold resets the timer.
                    stable_since = None
            # Compare the next frame against this one, not the first capture.
            baseline = current

    # Window and text conditions are evaluated before the deadline check, so
    # they are always tested at least once.
    while True:
        if isinstance(condition, WaitWindowCondition):
            matches = _list_windows(condition)
            polls += 1
            satisfied = False
            if condition.state == "exists":
                satisfied = bool(matches)
            elif condition.state == "focused":
                satisfied = any(item["foreground"] for item in matches)
            elif condition.state == "closed":
                satisfied = not matches
            if satisfied:
                return {
                    "satisfied": True,
                    "kind": condition.kind,
                    "state": condition.state,
                    "polls": polls,
                    "matches": matches[:10],
                }
        elif isinstance(condition, WaitTextCondition):
            image, region, mon, displays, screen_selection = _capture_region_image(
                screen,
                condition.region_x,
                condition.region_y,
                condition.region_width,
                condition.region_height,
            )
            blocks = _run_ocr(
                image,
                condition.language_hint,
                condition.min_confidence,
                region["x"],
                region["y"],
            )
            polls += 1
            matched = []
            for block in blocks:
                normalized = _normalize_text(block["text"])
                target = _normalize_text(condition.text)
                if _matches_text(normalized, target, condition.match):
                    matched.append(block)
            # With present=False the wait succeeds when the text is absent.
            satisfied = bool(matched) if condition.present else not bool(matched)
            if satisfied:
                return {
                    "satisfied": True,
                    "kind": condition.kind,
                    "mode": condition.mode,
                    "polls": polls,
                    "region": region,
                    "matches": matched,
                    "screen": screen_selection,
                    "display": mon,
                }
        else:
            raise HTTPException(status_code=400, detail="unsupported wait condition")

        if time.time() > deadline:
            return {
                "satisfied": False,
                "kind": condition.kind,
                "polls": polls,
            }
        time.sleep(req.poll_interval_ms / 1000.0)


def _pick_shell(explicit_shell: str | None) -> str:
    shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
    if shell_name not in {"powershell", "bash", "cmd"}:
        raise HTTPException(status_code=400, detail="unsupported shell")
    return shell_name


def _truncate_text(text: str, limit: int) -> tuple[str, bool]:
    if len(text) <= limit:
        return text, False
    return text[:limit], True


def _resolve_exec_program(shell_name: str, command: str) -> list[str]:
    if shell_name == "powershell":
        return ["powershell", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", command]
    if shell_name == "bash":
        return ["bash", "-lc", command]
    if shell_name == "cmd":
        return ["cmd", "/c", command]
    raise HTTPException(status_code=400, detail="unsupported shell")


def _exec_command(req: ExecRequest) -> dict:
    """Run a shell command and return its captured, size-limited output.

    The command is refused unless exec is enabled and an exec secret is
    configured. The timeout is the request's value (or the configured
    default), capped at the configured maximum. In dry-run mode the resolved
    ``argv`` is returned without running anything.

    Returns:
        A dict describing the run: ``executed``, ``timed_out``, ``shell``,
        ``command``, ``argv``, ``timeout_s``, ``cwd``, ``duration_ms``,
        ``exit_code`` (``None`` on timeout), ``stdout``/``stderr`` and their
        ``*_truncated`` flags. The dry-run dict has ``executed: False`` and
        ``dry_run: True`` instead of the result fields.

    Raises:
        HTTPException: 403 when exec is disabled or no secret is configured;
            400 for an unsupported shell, a bad ``cwd``, a missing shell
            executable, or any other OS error while starting the process.
    """

    def _as_text(raw) -> str:
        # subprocess documents that TimeoutExpired output is bytes whenever
        # anything was captured, even with text=True; str(bytes) would yield
        # the "b'...'" repr, so decode instead.
        if raw is None:
            return ""
        if isinstance(raw, (bytes, bytearray)):
            return bytes(raw).decode("utf-8", errors="replace")
        return str(raw)

    if not SETTINGS["exec_enabled"]:
        raise HTTPException(status_code=403, detail="exec endpoint disabled")
    if not SETTINGS["exec_secret"]:
        raise HTTPException(status_code=403, detail="exec secret not configured")

    run_dry = SETTINGS["dry_run"] or req.dry_run
    shell_name = _pick_shell(req.shell)

    timeout_s = req.timeout_s if req.timeout_s is not None else SETTINGS["exec_default_timeout_s"]
    timeout_s = min(timeout_s, SETTINGS["exec_max_timeout_s"])

    cwd = None
    if req.cwd:
        cwd = os.path.abspath(req.cwd)
        if not os.path.isdir(cwd):
            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = _resolve_exec_program(shell_name, req.command)

    if run_dry:
        return {
            "executed": False,
            "dry_run": True,
            "shell": shell_name,
            "command": req.command,
            "argv": argv,
            "timeout_s": timeout_s,
            "cwd": cwd,
        }

    # Monotonic clock: the reported duration must not be affected by
    # wall-clock adjustments during the run.
    started = time.monotonic()
    try:
        completed = subprocess.run(
            argv,
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        timed_out = True
        exit_code = None
        raw_stdout, raw_stderr = exc.stdout, exc.stderr
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"shell executable not found: {exc}") from exc
    except OSError as exc:
        # e.g. PermissionError; report it like _launch_app does instead of
        # letting it surface as an opaque 500.
        raise HTTPException(status_code=400, detail=f"failed to launch process: {exc}") from exc
    else:
        timed_out = False
        exit_code = completed.returncode
        raw_stdout, raw_stderr = completed.stdout, completed.stderr
    duration_ms = int((time.monotonic() - started) * 1000)

    limit = SETTINGS["exec_max_output_chars"]
    stdout, stdout_truncated = _truncate_text(_as_text(raw_stdout), limit)
    stderr, stderr_truncated = _truncate_text(_as_text(raw_stderr), limit)

    return {
        "executed": True,
        "timed_out": timed_out,
        "shell": shell_name,
        "command": req.command,
        "argv": argv,
        "timeout_s": timeout_s,
        "cwd": cwd,
        "duration_ms": duration_ms,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
        "stdout_truncated": stdout_truncated,
        "stderr_truncated": stderr_truncated,
    }


def _exec_action(req: ActionRequest, screen: int = 0) -> dict:
    """Perform one pointer or keyboard action, or describe it in dry-run mode.

    A target, when supplied, is resolved to pixel coordinates and checked
    against the allowed region. Pointer actions and ``scroll`` require a
    target. In dry-run mode (global setting or per-request flag) the input
    library is not imported and nothing is sent; note that the ``hotkey``
    key-count check also only runs outside dry-run.

    Returns:
        A dict with ``action``, ``executed``, ``dry_run``, ``screen``,
        ``display`` and ``resolved_target``.

    Raises:
        HTTPException: 400 when a required target is missing or a hotkey has
            no keys.
    """
    run_dry = SETTINGS["dry_run"] or req.dry_run
    selected_display, displays, screen_selection = _select_display(screen)

    pyautogui = None if run_dry else _import_input_lib()

    resolved_target = None
    if req.target is not None:
        x, y, info = _resolve_target(req.target)
        _enforce_allowed_region(x, y)
        resolved_target = {"x": x, "y": y, "target_info": info}

    duration_sec = req.duration_ms / 1000.0

    pointer_actions = {"move", "click", "right_click", "double_click", "middle_click"}
    if resolved_target is None:
        if req.action in pointer_actions:
            raise HTTPException(status_code=400, detail="target is required for pointer actions")
        if req.action == "scroll":
            raise HTTPException(status_code=400, detail="target is required for scroll")

    if not run_dry:
        kind = req.action
        if kind in pointer_actions or kind == "scroll":
            # A target is guaranteed here by the checks above.
            px, py = resolved_target["x"], resolved_target["y"]
            if kind == "move":
                pyautogui.moveTo(px, py, duration=duration_sec)
            elif kind == "click":
                pyautogui.click(
                    x=px,
                    y=py,
                    clicks=req.clicks,
                    interval=req.interval_ms / 1000.0,
                    button=req.button,
                    duration=duration_sec,
                )
            elif kind == "right_click":
                pyautogui.click(x=px, y=py, button="right", duration=duration_sec)
            elif kind == "double_click":
                pyautogui.doubleClick(x=px, y=py, interval=req.interval_ms / 1000.0)
            elif kind == "middle_click":
                pyautogui.click(x=px, y=py, button="middle", duration=duration_sec)
            else:
                # scroll: position the pointer first, then turn the wheel.
                pyautogui.moveTo(px, py, duration=duration_sec)
                pyautogui.scroll(req.scroll_amount)
        elif kind == "type":
            pyautogui.write(req.text, interval=req.interval_ms / 1000.0)
        elif kind == "hotkey":
            if len(req.keys) < 1:
                raise HTTPException(status_code=400, detail="keys is required for hotkey")
            pyautogui.hotkey(*req.keys)

    return {
        "action": req.action,
        "executed": not run_dry,
        "dry_run": run_dry,
        "screen": screen_selection,
        "display": selected_display,
        "resolved_target": resolved_target,
    }


def _localization_confidence(source: str, confidence: float | None = None) -> str:
    if source == "image_tool_point":
        return "high"
    if source == "ocr" and confidence is not None:
        if confidence >= 0.8:
            return "high"
        if confidence >= 0.55:
            return "medium"
    return "low"


@app.post("/v2/observe")
def observe_v2(req: ObserveRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Capture the screen (or a region), optionally OCR it, and store the observation.

    The capture covers the requested region when ``req.mode == "region"`` and
    the whole monitor otherwise. OCR runs unless ``req.ocr_mode`` is
    ``"none"``. The observation's metadata and OCR results are stored in
    ``OBSERVATIONS`` under a fresh id, which is returned in the response.

    Raises:
        HTTPException: 400 when the OCR area exceeds ``req.max_ocr_area_px``.
    """
    capture_started = time.perf_counter()
    # Passing None for the region values makes the helper return the full monitor.
    image, region, mon, displays, screen_selection = _capture_region_image(
        screen,
        req.region_x if req.mode == "region" else None,
        req.region_y if req.mode == "region" else None,
        req.region_width if req.mode == "region" else None,
        req.region_height if req.mode == "region" else None,
    )
    capture_ms = int((time.perf_counter() - capture_started) * 1000)

    # Image encoding happens between the two timers and is not included in timing_ms.
    encoded = None
    if req.include_image:
        encoded = _encode_image(image, req.image_format, req.jpeg_quality)

    ocr_started = time.perf_counter()
    blocks: list[dict] = []
    grouped_lines: list[dict] = []
    ocr_applied_mode = "none"
    if req.ocr_mode != "none":
        if req.ocr_mode == "screen":
            # A second, full-monitor capture is taken for OCR; its time is
            # counted in ocr_ms, not capture_ms.
            ocr_image, ocr_region, _, _, _ = _capture_region_image(screen, None, None, None, None)
        else:
            ocr_image, ocr_region = image, region

        area = ocr_region["width"] * ocr_region["height"]
        if req.max_ocr_area_px is not None and area > req.max_ocr_area_px:
            raise HTTPException(
                status_code=400,
                detail=f"ocr area {area} exceeds max_ocr_area_px {req.max_ocr_area_px}",
            )

        blocks = _run_ocr(
            ocr_image,
            req.language_hint,
            req.min_confidence,
            ocr_region["x"],
            ocr_region["y"],
        )
        if req.group_lines:
            grouped_lines = _group_ocr_lines(blocks)
        ocr_applied_mode = req.ocr_mode
    ocr_ms = int((time.perf_counter() - ocr_started) * 1000)

    # NOTE(review): nothing in this function removes entries from
    # OBSERVATIONS — confirm that eviction is handled elsewhere.
    observation_id = _request_id()
    OBSERVATIONS[observation_id] = {
        "id": observation_id,
        "region": region,
        "screen": screen_selection,
        "display": mon,
        "image_width": image.size[0],
        "image_height": image.size[1],
        "ocr_blocks": blocks,
        "ocr_lines": grouped_lines,
        "created_at_ms": _now_ms(),
    }

    return _ok(
        {
            "observation_id": observation_id,
            "region": region,
            "screen": screen_selection,
            "display": mon,
            "image": {
                "included": req.include_image,
                "format": req.image_format if req.include_image else None,
                "base64": encoded,
                "width": image.size[0],
                "height": image.size[1],
            },
            "ocr": {
                "mode": ocr_applied_mode,
                "min_confidence": req.min_confidence,
                "language_hint": req.language_hint,
                "block_count": len(blocks),
                "line_count": len(grouped_lines),
                "blocks": blocks,
                "lines": grouped_lines,
            },
            "timing_ms": {
                "capture_ms": capture_ms,
                # When OCR was skipped, report 0 rather than the timer overhead.
                "ocr_ms": ocr_ms if req.ocr_mode != "none" else 0,
                "total_ms": capture_ms + (ocr_ms if req.ocr_mode != "none" else 0),
            },
        }
    )


@app.post("/v2/localize")
def localize_v2(req: LocalizeRequestV2, _: None = Depends(_auth)):
    """Turn a stored observation plus a hint into a clickable screen pixel.

    Two hint kinds are supported. An ``image_tool_point`` is an offset inside
    the observation image and is translated by the observation region's
    origin. Otherwise ``text_query`` is matched against the observation's OCR
    lines and the centre of the ``candidate_index``-th match is used. The
    resulting pixel is checked against the allowed region and stored in
    ``RESOLVED_TARGETS`` under a fresh id.

    Returns:
        A success envelope with ``resolved_target_id``, ``source``,
        ``localization_confidence`` and ``pixel`` (plus source-specific
        details), or a 404 error envelope when no text matches.

    Raises:
        HTTPException: 400 when the point lies outside the observation image
            or ``candidate_index`` is beyond the match list.
    """
    observation = _get_observation(req.observation_id)
    region = observation["region"]
    image_width = observation["image_width"]
    image_height = observation["image_height"]

    point = req.image_tool_point
    if point is not None:
        # Check both edges: a negative offset is just as far outside the
        # image as one past the right/bottom edge.
        inside = 0 <= point.x < image_width and 0 <= point.y < image_height
        if not inside:
            raise HTTPException(status_code=400, detail="image_tool_point outside observation image bounds")
        x = region["x"] + point.x
        y = region["y"] + point.y
        _enforce_allowed_region(x, y)
        resolved_target_id = _request_id()
        RESOLVED_TARGETS[resolved_target_id] = {
            "id": resolved_target_id,
            "observation_id": req.observation_id,
            "x": x,
            "y": y,
            "source": "image_tool_point",
        }
        return _ok(
            {
                "resolved_target_id": resolved_target_id,
                "source": "image_tool_point",
                "localization_confidence": _localization_confidence("image_tool_point"),
                "pixel": {"x": x, "y": y},
                "observation_region": region,
                "image_bounds": {"width": image_width, "height": image_height},
            }
        )

    # Prefer the lines stored with the observation; group the raw blocks only
    # when none were stored.
    lines = observation.get("ocr_lines") or _group_ocr_lines(observation.get("ocr_blocks", []))
    matches = _find_text_matches(lines, req.text_query or "", req.text_match, False, 200)
    if not matches:
        return _err("not_found", "no localization candidates found", 404, {"found": False, "matches": []})
    if req.candidate_index >= len(matches):
        raise HTTPException(status_code=400, detail="candidate_index is outside match results")

    chosen = matches[req.candidate_index]
    bbox = chosen["bbox"]
    # Aim at the box centre, but at least one pixel inside its top-left corner.
    x = bbox["x"] + max(1, bbox["width"] // 2)
    y = bbox["y"] + max(1, bbox["height"] // 2)
    _enforce_allowed_region(x, y)
    resolved_target_id = _request_id()
    RESOLVED_TARGETS[resolved_target_id] = {
        "id": resolved_target_id,
        "observation_id": req.observation_id,
        "x": x,
        "y": y,
        "source": "ocr",
        "match": chosen,
    }

    return _ok(
        {
            "resolved_target_id": resolved_target_id,
            "source": "ocr",
            "localization_confidence": _localization_confidence("ocr", chosen.get("confidence")),
            "pixel": {"x": x, "y": y},
            "selected_match": chosen,
            "match_count": len(matches),
        }
    )


@app.post("/v2/act")
def act_v2(req: ActRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Translate a v2 action into the legacy action form and execute it."""
    return _ok(_exec_action(_resolve_v2_action(req.action), screen))


@app.post("/v2/act-verify")
def act_verify_v2(req: ActVerifyRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Execute a v2 action and verify its effect, with risk-based defaults.

    Retry and timing settings left as ``None`` in the request fall back to
    the defaults for ``req.risk_level``. Responds with a success envelope
    when verification succeeded, otherwise with a 409
    ``verification_failed`` error carrying the same payload.
    """
    defaults = _risk_defaults(req.risk_level)

    def _setting(name: str):
        # An explicit request value wins; None means "use the risk default".
        override = getattr(req, name)
        return defaults[name] if override is None else override

    verify_req = VerifyActionRequest(
        action=_resolve_v2_action(req.action),
        condition=req.condition,
        retries=_setting("retries"),
        timeout_ms=_setting("timeout_ms"),
        poll_interval_ms=_setting("poll_interval_ms"),
        retry_delay_ms=_setting("retry_delay_ms"),
        stop_on_action_error=req.stop_on_action_error,
    )
    outcome = _run_verified_action(verify_req, screen)

    payload = {"risk_level": req.risk_level, "defaults_applied": defaults}
    payload.update(outcome)

    if not outcome.get("success", False):
        return _err("verification_failed", "action verification did not satisfy condition", 409, payload)
    return _ok(payload)


@app.get("/health")
def health(_: None = Depends(_auth)):
    """Report service identity and the current safety and exec settings.

    The exec secret itself is never returned, only whether one is configured.
    """
    exec_status = {
        "enabled": SETTINGS["exec_enabled"],
        "secret_configured": bool(SETTINGS["exec_secret"]),
        "default_shell": SETTINGS["exec_default_shell"],
        "default_timeout_s": SETTINGS["exec_default_timeout_s"],
        "max_timeout_s": SETTINGS["exec_max_timeout_s"],
    }
    status = {
        "service": "clickthrough",
        "version": app.version,
        "dry_run": SETTINGS["dry_run"],
        "allowed_region": SETTINGS["allowed_region"],
        "exec": exec_status,
    }
    return _ok(status)


@app.get("/displays")
def displays(_: None = Depends(_auth)):
    """List the detected displays; the default screen index is always 0."""
    return _ok({"displays": _get_displays(), "default_screen": 0})


@app.post("/exec")
def exec_command(
    req: ExecRequest,
    x_clickthrough_exec_secret: Optional[str] = Header(default=None),
    _: None = Depends(_auth),
):
    """Run a shell command after checking the dedicated exec secret header.

    Raises:
        HTTPException: 403 when no exec secret is configured; 401 when the
            header is missing or does not match.
    """
    expected = SETTINGS["exec_secret"]
    if not expected:
        raise HTTPException(status_code=403, detail="exec secret not configured")

    provided = x_clickthrough_exec_secret
    # Compare as bytes: hmac.compare_digest raises TypeError for str
    # arguments containing non-ASCII characters, which would turn a bad
    # header into a 500 instead of a 401.
    if not provided or not hmac.compare_digest(provided.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="invalid exec secret")

    result = _exec_command(req)
    return _ok(result)


@app.get("/windows")
def windows(
    title_contains: str | None = None,
    title_regex: str | None = None,
    process_name: str | None = None,
    hwnd: int | None = None,
    visible_only: bool = True,
    _: None = Depends(_auth),
):
    """List windows matching the optional query-string filters, with a count."""
    filters = {
        "title_contains": title_contains,
        "title_regex": title_regex,
        "process_name": process_name,
        "hwnd": hwnd,
        "visible_only": visible_only,
    }
    found = _list_windows(WindowQuery(**filters))
    return _ok({"windows": found, "count": len(found)})


@app.post("/windows/action")
def window_action(req: WindowActionRequest, _: None = Depends(_auth)):
    """Apply the requested action to the matching window and return the outcome."""
    return _ok(_apply_window_action(req))


@app.post("/launch")
def launch(req: LaunchRequest, _: None = Depends(_auth)):
    """Launch the requested application and return the launch result."""
    return _ok(_launch_app(req))


if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on the configured
    # host and port. uvicorn is imported here so that merely importing this
    # module does not require it.
    import uvicorn

    bind_host, bind_port = SETTINGS["host"], SETTINGS["port"]
    uvicorn.run("server.app:app", host=bind_host, port=bind_port, reload=False)