All checks were successful
python-syntax / syntax-check (push) Successful in 7s
1842 lines
64 KiB
Python
1842 lines
64 KiB
Python
import base64
|
|
import ctypes
|
|
import hmac
|
|
import io
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import uuid
|
|
from typing import Any, Literal, Optional
|
|
|
|
from dotenv import load_dotenv
|
|
from fastapi import Depends, FastAPI, Header, HTTPException, Request
|
|
from fastapi.exceptions import RequestValidationError
|
|
from fastapi.responses import JSONResponse
|
|
from PIL import ImageChops, ImageStat
|
|
from pydantic import BaseModel, Field, model_validator
|
|
|
|
|
|
# Load variables from a local ".env" file before any setting is read below.
# override=False: values already present in the process environment win.
load_dotenv(dotenv_path=".env", override=False)

# The FastAPI application object; the exception handlers further down are
# registered on it via @app.exception_handler.
app = FastAPI(title="clickthrough", version="0.1.0")
|
|
|
|
|
|
def _ok(data: Any, status_code: int = 200):
    """Return ``data`` wrapped in the success envelope as a JSON response.

    The envelope carries ``ok=True``, a fresh request id, the current time in
    milliseconds, the payload under ``data`` and ``error`` set to ``None``.
    """
    envelope = {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "data": data,
        "error": None,
    }
    return JSONResponse(status_code=status_code, content=envelope)
|
|
|
|
|
|
def _err(code: str, message: str, status_code: int, details: Any = None):
    """Return the failure envelope as a JSON response.

    Mirrors ``_ok``: ``ok`` is ``False``, ``data`` is ``None`` and ``error``
    holds ``code``, ``message`` and the optional ``details``.
    """
    error_info = {"code": code, "message": message, "details": details}
    envelope = {
        "ok": False,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "data": None,
        "error": error_info,
    }
    return JSONResponse(status_code=status_code, content=envelope)
|
|
|
|
|
|
@app.exception_handler(HTTPException)
async def _http_exception_handler(_: Request, exc: HTTPException):
    """Render an HTTPException in the standard error envelope.

    A dict ``detail`` is passed through as ``details`` and its "message" entry
    (or "request failed") becomes the message; any other detail is
    stringified and used as the message.
    """
    detail = exc.detail
    if not isinstance(detail, dict):
        return _err("http_error", str(detail), exc.status_code)
    summary = str(detail.get("message", "request failed"))
    return _err("http_error", summary, exc.status_code, detail)
|
|
|
|
|
|
@app.exception_handler(Exception)
async def _unhandled_exception_handler(_: Request, exc: Exception):
    """Turn any otherwise unhandled exception into a generic 500 envelope.

    Only the exception's class name is exposed to the client.
    """
    exposed = {"type": type(exc).__name__}
    return _err("internal_error", "internal server error", 500, exposed)
|
|
|
|
|
|
@app.exception_handler(RequestValidationError)
async def _validation_exception_handler(_: Request, exc: RequestValidationError):
    """Render request-validation failures as a 422 error envelope.

    The raw ``exc.errors()`` list can contain objects that ``json.dumps``
    cannot serialize: with pydantic v2, a ``ValueError`` raised inside a
    model validator is kept as an exception instance under ``ctx.error``.
    Passing that straight to ``JSONResponse`` turns the intended 422 into a
    500, so the list is normalised with ``jsonable_encoder`` first (the same
    approach FastAPI's built-in handler uses).
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.encoders import jsonable_encoder

    details = jsonable_encoder(exc.errors())
    return _err("validation_error", "request validation failed", 422, details)
|
|
|
|
|
|
def _env_bool(name: str, default: bool) -> bool:
    """Read environment variable ``name`` as a boolean.

    Returns ``default`` when the variable is unset. Otherwise the value is
    true only if, after trimming and lower-casing, it is one of
    "1", "true", "yes" or "on"; anything else (including "") is false.
    """
    value = os.environ.get(name)
    if value is not None:
        truthy = {"1", "true", "yes", "on"}
        return value.strip().lower() in truthy
    return default
|
|
|
|
|
|
def _parse_allowed_region() -> Optional[tuple[int, int, int, int]]:
    """Parse CLICKTHROUGH_ALLOWED_REGION into an ``(x, y, width, height)`` tuple.

    Returns ``None`` when the variable is unset or empty.

    Raises:
        ValueError: if the value does not have exactly four comma-separated
            parts, a part is not an integer, or width/height is not positive.
    """
    spec = os.getenv("CLICKTHROUGH_ALLOWED_REGION")
    if spec is None or spec == "":
        return None

    fields = [piece.strip() for piece in spec.split(",")]
    if len(fields) != 4:
        raise ValueError("CLICKTHROUGH_ALLOWED_REGION must be x,y,width,height")

    x, y, w, h = map(int, fields)
    if min(w, h) <= 0:
        raise ValueError("CLICKTHROUGH_ALLOWED_REGION width/height must be > 0")
    return x, y, w, h
|
|
|
|
|
|
# Process-wide configuration, read once from the environment at import time.
# The int(...) conversions and _parse_allowed_region() raise ValueError during
# import when an override is malformed.
SETTINGS = {
    # Bind address / port. The consumer is outside this view — presumably the
    # server entry point; confirm.
    "host": os.getenv("CLICKTHROUGH_HOST", "127.0.0.1"),
    "port": int(os.getenv("CLICKTHROUGH_PORT", "8123")),
    # Shared secret checked by _auth; an empty value disables that check.
    "token": os.getenv("CLICKTHROUGH_TOKEN", "").strip(),
    # Global dry-run switch (honoured by _launch_app, among others).
    "dry_run": _env_bool("CLICKTHROUGH_DRY_RUN", False),
    # Defaults for ScreenRequest.grid_rows / grid_cols.
    "default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")),
    "default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")),
    # Optional (x, y, width, height) rectangle enforced by
    # _enforce_allowed_region; None when the variable is unset.
    "allowed_region": _parse_allowed_region(),
    # Command-execution settings; their consumer is not visible in this part
    # of the file.
    # NOTE(review): exec is enabled unless explicitly turned off — confirm
    # this default is intended.
    "exec_enabled": _env_bool("CLICKTHROUGH_EXEC_ENABLED", True),
    "exec_default_shell": os.getenv("CLICKTHROUGH_EXEC_DEFAULT_SHELL", "powershell").strip().lower(),
    "exec_default_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_TIMEOUT_S", "30")),
    "exec_max_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_MAX_TIMEOUT_S", "120")),
    "exec_max_output_chars": int(os.getenv("CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS", "20000")),
    "exec_secret": os.getenv("CLICKTHROUGH_EXEC_SECRET", "").strip(),
    # Optional explicit tesseract executable; applied by _import_ocr_libs
    # when non-empty.
    "tesseract_cmd": os.getenv("CLICKTHROUGH_TESSERACT_CMD", "").strip(),
}
|
|
|
|
|
|
# Request model for a screen capture with an optional grid overlay.
# (The endpoint using it is outside this view; field meanings below are read
# from names and constraints only.)
class ScreenRequest(BaseModel):
    # Whether a grid should be drawn on the capture.
    with_grid: bool = True
    # Grid dimensions; defaults come from SETTINGS, allowed range 1..200.
    grid_rows: int = Field(default=SETTINGS["default_grid_rows"], ge=1, le=200)
    grid_cols: int = Field(default=SETTINGS["default_grid_cols"], ge=1, le=200)
    # Whether cell labels should be drawn.
    include_labels: bool = True
    # Output encoding; jpeg_quality (1..100) presumably applies to "jpeg" only.
    image_format: Literal["png", "jpeg"] = "png"
    jpeg_quality: int = Field(default=85, ge=1, le=100)
|
|
|
|
|
|
# Request model for a zoomed capture around a centre point.
# (The endpoint using it is outside this view.)
class ZoomRequest(BaseModel):
    # Centre of the zoom area; both coordinates must be >= 0.
    center_x: int = Field(ge=0)
    center_y: int = Field(ge=0)
    # Size of the zoom area; each dimension must be at least 10.
    width: int = Field(default=500, ge=10)
    height: int = Field(default=350, ge=10)
    # Grid overlay options; rows/cols are limited to 1..300.
    with_grid: bool = True
    grid_rows: int = Field(default=20, ge=1, le=300)
    grid_cols: int = Field(default=20, ge=1, le=300)
    include_labels: bool = True
    # Output encoding; jpeg_quality is limited to 1..100.
    image_format: Literal["png", "jpeg"] = "png"
    jpeg_quality: int = Field(default=90, ge=1, le=100)
|
|
|
|
|
|
# Pointer target given directly in pixels. _resolve_target resolves it to
# (x + dx, y + dy).
class PixelTarget(BaseModel):
    # Discriminator value for the Target union.
    mode: Literal["pixel"]
    # Base coordinates.
    x: int
    y: int
    # Integer offsets added to x / y when the target is resolved.
    dx: int = 0
    dy: int = 0
|
|
|
|
|
|
# Pointer target expressed as a cell of a grid laid over a rectangular region.
class GridTarget(BaseModel):
    # Discriminator value for the Target union.
    mode: Literal["grid"]
    # Region the grid covers; width and height must be positive.
    region_x: int
    region_y: int
    region_width: int = Field(gt=0)
    region_height: int = Field(gt=0)
    # Grid dimensions (positive) and the zero-based cell being addressed.
    rows: int = Field(gt=0)
    cols: int = Field(gt=0)
    row: int = Field(ge=0)
    col: int = Field(ge=0)
    # Offset inside the cell; the validator restricts both to [-1, 1].
    dx: float = 0.0
    dy: float = 0.0

    @model_validator(mode="after")
    def _validate_indices(self):
        """Reject cells outside the grid and offsets outside [-1, 1]."""
        cell_in_grid = self.row < self.rows and self.col < self.cols
        if not cell_in_grid:
            raise ValueError("row/col must be inside rows/cols")

        dx_in_range = -1.0 <= self.dx <= 1.0
        if not dx_in_range:
            raise ValueError("dx must be in [-1, 1]")

        dy_in_range = -1.0 <= self.dy <= 1.0
        if not dy_in_range:
            raise ValueError("dy must be in [-1, 1]")

        return self
|
|
|
|
|
|
# Either kind of pointer target; _resolve_target accepts this union and
# distinguishes the members with isinstance.
Target = PixelTarget | GridTarget
|
|
|
|
|
|
# A single input action. _resolve_v2_action builds instances of this model
# from ActionRequestV2; the executor that consumes the fields is outside this
# view, so the per-field notes are hedged.
class ActionRequest(BaseModel):
    # Which action to perform.
    action: Literal[
        "move",
        "click",
        "right_click",
        "double_click",
        "middle_click",
        "scroll",
        "type",
        "hotkey",
    ]
    # Optional pointer target (pixel or grid).
    target: Optional[Target] = None
    # Limited to 0..20000 ms (presumably pointer-move duration — confirm).
    duration_ms: int = Field(default=0, ge=0, le=20000)
    button: Literal["left", "right", "middle"] = "left"
    # Limited to 1..10.
    clicks: int = Field(default=1, ge=1, le=10)
    scroll_amount: int = 0
    # Presumably the text for "type" and the key names for "hotkey" — confirm.
    text: str = ""
    keys: list[str] = Field(default_factory=list)
    # Limited to 0..5000 ms.
    interval_ms: int = Field(default=20, ge=0, le=5000)
    # Per-request dry-run flag.
    dry_run: bool = False
|
|
|
|
|
|
# A list of actions submitted together (1..100 entries).
class BatchRequest(BaseModel):
    actions: list[ActionRequest] = Field(min_length=1, max_length=100)
    # Presumably stops the batch at the first failing action — the executor is
    # outside this view; confirm.
    stop_on_error: bool = True
|
|
|
|
|
|
# Request model for running a shell command. The handler is outside this
# view; SETTINGS holds the related exec_* defaults and limits.
class ExecRequest(BaseModel):
    # Command text, 1..10000 characters.
    command: str = Field(min_length=1, max_length=10000)
    # Shell to use; None presumably falls back to SETTINGS["exec_default_shell"].
    shell: Literal["powershell", "bash", "cmd"] | None = None
    # 1..600 seconds when given.
    # NOTE(review): the schema allows up to 600 while exec_max_timeout_s
    # defaults to 120 — confirm the handler clamps the value.
    timeout_s: int | None = Field(default=None, ge=1, le=600)
    # Optional working directory.
    cwd: str | None = None
    dry_run: bool = False
|
|
|
|
|
|
# Request model for OCR over the screen, a screen region, or a supplied image.
class OCRRequest(BaseModel):
    # Where the image to read comes from.
    mode: Literal["screen", "region", "image"] = "screen"
    # Region rectangle; all four values are required when mode == "region".
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Base64 image payload; required when mode == "image".
    image_base64: str | None = None
    # Optional OCR language hint (1..64 characters).
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    # Confidence threshold on a 0..1 scale.
    min_confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_mode_inputs(self):
        """Check that the inputs required by the selected mode are present."""
        if self.mode == "region":
            rectangle = (self.region_x, self.region_y, self.region_width, self.region_height)
            if None in rectangle:
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        elif self.mode == "image":
            if not self.image_base64:
                raise ValueError("image_base64 is required for mode=image")
        return self
|
|
|
|
|
|
# Filter describing which top-level windows to select; applied by
# _list_windows. All given criteria must match.
class WindowQuery(BaseModel):
    # Case-insensitive substring of the window title.
    title_contains: str | None = Field(default=None, max_length=512)
    # Regular expression searched in the title (compiled with re.IGNORECASE).
    title_regex: str | None = Field(default=None, max_length=512)
    # Process image name, compared case-insensitively for equality.
    process_name: str | None = Field(default=None, max_length=260)
    # Exact window handle.
    hwnd: int | None = Field(default=None, ge=1)
    # When true, windows that are not visible are skipped.
    visible_only: bool = True
|
|
|
|
|
|
# A window selector plus the action to apply to the single matching window;
# consumed by _apply_window_action.
class WindowActionRequest(WindowQuery):
    action: Literal["focus", "restore", "minimize", "maximize", "close"]
    # How long _apply_window_action polls for the resulting window state
    # (0..60000 ms).
    timeout_ms: int = Field(default=3000, ge=0, le=60000)
|
|
|
|
|
|
# Request model for starting a process; consumed by _launch_app.
class LaunchRequest(BaseModel):
    # Program to start and its arguments (argv is [executable, *args]).
    executable: str = Field(min_length=1, max_length=2048)
    args: list[str] = Field(default_factory=list, max_length=100)
    # Optional working directory; must be an existing directory.
    cwd: str | None = None
    # When true, _launch_app polls for a matching window after launching.
    wait_for_window: bool = False
    # Window selector for that wait; when omitted, the executable's base name
    # is used as process_name.
    match: WindowQuery | None = None
    # Upper bound for the window wait (0..120000 ms).
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    # When true (or when the global dry-run setting is on) nothing is started.
    dry_run: bool = False
|
|
|
|
|
|
# Wait condition on text being present (or absent) on screen or in a region.
# The evaluator is outside this view.
class WaitTextCondition(BaseModel):
    # Discriminator value within the condition union.
    kind: Literal["text"]
    mode: Literal["screen", "region"] = "screen"
    # Text to look for and how to compare it.
    text: str = Field(min_length=1, max_length=512)
    match: Literal["contains", "exact", "regex"] = "contains"
    # Presumably: wait for presence when true, absence when false — confirm.
    present: bool = True
    # Region rectangle; all four values are required when mode == "region".
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_region(self):
        """Require the full region rectangle when mode is "region"."""
        if self.mode != "region":
            return self
        rectangle = (self.region_x, self.region_y, self.region_width, self.region_height)
        if None in rectangle:
            raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        return self
|
|
|
|
|
|
# Wait condition on a window selected by the inherited WindowQuery fields.
# The evaluator is outside this view.
class WaitWindowCondition(WindowQuery):
    # Discriminator value within the condition union.
    kind: Literal["window"]
    # Window state to wait for.
    state: Literal["exists", "focused", "closed"] = "exists"
|
|
|
|
|
|
# Wait condition on the screen content changing or becoming stable.
# The evaluator is outside this view.
class WaitVisualCondition(BaseModel):
    # Discriminator value within the condition union.
    kind: Literal["visual"]
    state: Literal["change", "stable"] = "change"
    # Optional region rectangle (no validator here requires all four).
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Difference threshold on a 0..1 scale.
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)
    # 0..60000 ms (presumably how long the image must stay unchanged for
    # state="stable" — confirm).
    stable_for_ms: int = Field(default=800, ge=0, le=60000)
|
|
|
|
|
|
# Request model for waiting on one condition (text, window, or visual).
class WaitRequest(BaseModel):
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    # Overall limit, 0..120000 ms.
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    # Delay between checks, 50..10000 ms.
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)
|
|
|
|
|
|
# OCR request extended with a text query; inherits the mode/region/image
# fields and their validation from OCRRequest.
class OCRFindRequest(OCRRequest):
    # Text to search for and how to compare it.
    query: str = Field(min_length=1, max_length=512)
    match: Literal["contains", "exact", "regex"] = "contains"
    # Presumably matches against whole lines rather than single words — confirm.
    group_lines: bool = True
    # Result cap, 1..200.
    max_results: int = Field(default=20, ge=1, le=200)
|
|
|
|
|
|
# Request model for comparing two images: two captures of the screen/region,
# or two supplied images. The handler is outside this view.
class VisionDiffRequest(BaseModel):
    mode: Literal["screen", "region", "image"] = "screen"
    # Region rectangle; all four values are required when mode == "region".
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Base64 images; both are required when mode == "image".
    before_image_base64: str | None = None
    after_image_base64: str | None = None
    # 0..60000 ms (presumably the pause between the two captures — confirm).
    delay_ms: int = Field(default=300, ge=0, le=60000)
    # Difference threshold on a 0..1 scale.
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_inputs(self):
        """Check that the inputs required by the selected mode are present."""
        if self.mode == "region":
            rectangle = (self.region_x, self.region_y, self.region_width, self.region_height)
            if None in rectangle:
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        elif self.mode == "image":
            both_images = self.before_image_base64 and self.after_image_base64
            if not both_images:
                raise ValueError("before_image_base64 and after_image_base64 are required for mode=image")
        return self
|
|
|
|
|
|
# Request model for sampling the screen (or a region) over time to judge
# stability. The handler is outside this view.
class VisionStabilityRequest(BaseModel):
    # Optional region rectangle.
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Sampling cadence (50..10000 ms) and total duration (0..120000 ms).
    sample_interval_ms: int = Field(default=250, ge=50, le=10000)
    duration_ms: int = Field(default=1200, ge=0, le=120000)
    # Difference threshold on a 0..1 scale.
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)
|
|
|
|
|
|
# An action paired with a condition that should hold afterwards.
# The handler is outside this view.
class VerifyActionRequest(BaseModel):
    action: ActionRequest
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    # Retry count, 0..10.
    retries: int = Field(default=0, ge=0, le=10)
    # Condition wait limit (0..120000 ms) and polling delay (50..10000 ms).
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    poll_interval_ms: int = Field(default=250, ge=50, le=10000)
    # Pause between retries, 0..60000 ms.
    retry_delay_ms: int = Field(default=200, ge=0, le=60000)
    # Presumably aborts retries when the action itself fails — confirm.
    stop_on_action_error: bool = True
|
|
|
|
|
|
# V2 observation request: capture the screen or a region, optionally with
# OCR. The handler is outside this view.
class ObserveRequestV2(BaseModel):
    mode: Literal["screen", "region"] = "screen"
    # Region rectangle; all four values are required when mode == "region".
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    # Image output options.
    include_image: bool = True
    image_format: Literal["png", "jpeg"] = "jpeg"
    jpeg_quality: int = Field(default=75, ge=1, le=100)
    # OCR options.
    ocr_mode: Literal["none", "region", "screen"] = "none"
    language_hint: str | None = Field(default=None, min_length=1, max_length=64)
    min_confidence: float = Field(default=0.4, ge=0.0, le=1.0)
    # At least 1000 when given (presumably an upper bound on the OCR area in
    # pixels — confirm).
    max_ocr_area_px: int | None = Field(default=1_500_000, ge=1000)
    group_lines: bool = True

    @model_validator(mode="after")
    def _validate_region(self):
        """Require the full region rectangle when mode is "region"."""
        if self.mode != "region":
            return self
        rectangle = (self.region_x, self.region_y, self.region_width, self.region_height)
        if None in rectangle:
            raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        return self
|
|
|
|
|
|
# A point with non-negative integer coordinates; used as the
# image_tool_point selector of LocalizeRequestV2.
class ImageToolPoint(BaseModel):
    x: int = Field(ge=0)
    y: int = Field(ge=0)
|
|
|
|
|
|
# V2 localisation request: pick a target inside a stored observation either
# by text or by a point. The handler is outside this view.
class LocalizeRequestV2(BaseModel):
    # Identifier of a previously stored observation.
    observation_id: str = Field(min_length=1, max_length=128)
    # Text selector and its comparison mode.
    text_query: str | None = Field(default=None, max_length=512)
    text_match: Literal["contains", "exact", "regex"] = "contains"
    # Point selector.
    image_tool_point: ImageToolPoint | None = None
    # Non-negative index (presumably into the list of text matches — confirm).
    candidate_index: int = Field(default=0, ge=0)

    @model_validator(mode="after")
    def _validate_selector(self):
        """Require exactly one selector: a non-blank text_query or a point."""
        query_text = self.text_query.strip() if self.text_query else ""
        selector_count = int(bool(query_text)) + int(self.image_tool_point is not None)
        if selector_count != 1:
            raise ValueError("provide exactly one of text_query or image_tool_point")
        return self
|
|
|
|
|
|
# V2 action target: either a reference to a stored resolved target or an
# explicit pixel position (never both, never neither).
class ActionTargetV2(BaseModel):
    # Key into RESOLVED_TARGETS (see _resolve_v2_action).
    resolved_target_id: str | None = Field(default=None, max_length=128)
    # Explicit pixel position; both values must be given together.
    pixel_x: int | None = None
    pixel_y: int | None = None

    @model_validator(mode="after")
    def _validate_shape(self):
        """Enforce "resolved id XOR pixel pair", with a complete pixel pair."""
        uses_id = bool(self.resolved_target_id)
        supplied_coords = [c for c in (self.pixel_x, self.pixel_y) if c is not None]
        uses_pixel = len(supplied_coords) > 0

        if uses_id == uses_pixel:
            raise ValueError("provide either resolved_target_id or pixel_x/pixel_y")
        if uses_pixel and len(supplied_coords) != 2:
            raise ValueError("pixel_x and pixel_y are both required")
        return self
|
|
|
|
|
|
# V2 form of ActionRequest: identical fields except that the target is an
# ActionTargetV2. _resolve_v2_action converts it into an ActionRequest.
class ActionRequestV2(BaseModel):
    # Which action to perform.
    action: Literal[
        "move",
        "click",
        "right_click",
        "double_click",
        "middle_click",
        "scroll",
        "type",
        "hotkey",
    ]
    # Optional target: stored resolved target id or explicit pixel position.
    target: ActionTargetV2 | None = None
    # The remaining fields are copied unchanged into the ActionRequest.
    duration_ms: int = Field(default=0, ge=0, le=20000)
    button: Literal["left", "right", "middle"] = "left"
    clicks: int = Field(default=1, ge=1, le=10)
    scroll_amount: int = 0
    text: str = ""
    keys: list[str] = Field(default_factory=list)
    interval_ms: int = Field(default=20, ge=0, le=5000)
    dry_run: bool = False
|
|
|
|
|
|
# Request wrapper carrying a single V2 action.
class ActRequestV2(BaseModel):
    action: ActionRequestV2
|
|
|
|
|
|
# V2 "act then verify" request: an action plus the condition expected to hold
# afterwards. The handler is outside this view.
class ActVerifyRequestV2(BaseModel):
    action: ActionRequestV2
    condition: WaitTextCondition | WaitWindowCondition | WaitVisualCondition
    # Risk profile; _risk_defaults maps it to retry/timeout values.
    risk_level: Literal["low", "high"] = "low"
    # Optional overrides; None presumably means "use the _risk_defaults value"
    # — confirm against the handler.
    retries: int | None = Field(default=None, ge=0, le=10)
    timeout_ms: int | None = Field(default=None, ge=0, le=120000)
    poll_interval_ms: int | None = Field(default=None, ge=50, le=10000)
    retry_delay_ms: int | None = Field(default=None, ge=0, le=60000)
    stop_on_action_error: bool = True
|
|
|
|
|
|
# In-process stores keyed by id. The code that fills them is outside this
# view; nothing visible here evicts entries.
# Looked up by _get_observation via observation_id.
OBSERVATIONS: dict[str, dict[str, Any]] = {}
# Looked up by _resolve_v2_action via resolved_target_id; entries are read
# for their "x" and "y" values.
RESOLVED_TARGETS: dict[str, dict[str, Any]] = {}
|
|
|
|
|
|
def _get_observation(observation_id: str) -> dict[str, Any]:
    """Return the stored observation for ``observation_id``.

    Raises:
        HTTPException: 404 when no observation is stored under that id.
    """
    stored = OBSERVATIONS.get(observation_id)
    if stored is not None:
        return stored
    raise HTTPException(status_code=404, detail="observation_id not found")
|
|
|
|
|
|
def _resolve_v2_action(req: ActionRequestV2) -> ActionRequest:
    """Convert a V2 action into an ``ActionRequest`` with a pixel target.

    A ``resolved_target_id`` is looked up in ``RESOLVED_TARGETS`` and its
    stored x/y are used; otherwise the explicit pixel_x/pixel_y are used
    (missing values fall back to 0). All other fields are copied unchanged.

    Raises:
        HTTPException: 404 when the referenced resolved target is unknown.
    """
    pixel_target: Target | None = None
    spec = req.target
    if spec is not None:
        if spec.resolved_target_id:
            stored = RESOLVED_TARGETS.get(spec.resolved_target_id)
            if stored is None:
                raise HTTPException(status_code=404, detail="resolved_target_id not found")
            px, py = stored["x"], stored["y"]
        else:
            px, py = spec.pixel_x or 0, spec.pixel_y or 0
        pixel_target = PixelTarget(mode="pixel", x=px, y=py, dx=0, dy=0)

    copied = {
        "action": req.action,
        "target": pixel_target,
        "duration_ms": req.duration_ms,
        "button": req.button,
        "clicks": req.clicks,
        "scroll_amount": req.scroll_amount,
        "text": req.text,
        "keys": req.keys,
        "interval_ms": req.interval_ms,
        "dry_run": req.dry_run,
    }
    return ActionRequest(**copied)
|
|
|
|
|
|
def _risk_defaults(risk_level: str) -> dict[str, int]:
    """Return retry/timeout defaults for a risk level.

    "high" gets one retry and longer waits; every other value gets the
    low-risk profile.
    """
    is_high = risk_level == "high"
    return {
        "retries": 1 if is_high else 0,
        "timeout_ms": 6000 if is_high else 2500,
        "poll_interval_ms": 250 if is_high else 200,
        "retry_delay_ms": 300 if is_high else 150,
    }
|
|
|
|
|
|
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
    """Dependency that checks the X-Clickthrough-Token request header.

    When SETTINGS["token"] is empty the check is disabled. Otherwise the
    header must equal the configured token.

    Raises:
        HTTPException: 401 when a token is configured and the header is
            missing or different.
    """
    expected = SETTINGS["token"]
    if not expected:
        return

    # A missing header (or a non-string placeholder when called outside of
    # FastAPI's dependency injection) can never match.
    supplied = x_clickthrough_token if isinstance(x_clickthrough_token, str) else ""

    # Constant-time comparison so response timing does not leak how much of
    # the token was correct. Encoding to bytes keeps compare_digest usable
    # for non-ASCII values.
    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(status_code=401, detail="invalid token")
|
|
|
|
|
|
def _now_ms() -> int:
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    millis = time.time() * 1000
    return int(millis)
|
|
|
|
|
|
def _request_id() -> str:
    """Return a new random (version 4) UUID in its canonical string form."""
    fresh = uuid.uuid4()
    return str(fresh)
|
|
|
|
|
|
def _import_capture_libs():
    """Lazily import the capture stack and return ``(Image, ImageDraw, mss)``.

    Raises:
        HTTPException: 500 when either import fails for any reason.
    """
    try:
        from PIL import Image, ImageDraw
        import mss
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"capture backend unavailable: {exc}") from exc
    return Image, ImageDraw, mss
|
|
|
|
|
|
def _display_region(mon: dict, screen: int, mss_index: int, primary: bool) -> dict:
    """Describe one monitor as a display record.

    ``mon`` supplies "left", "top", "width" and "height"; they are exposed
    as "x", "y", "width" and "height" next to the given ``screen`` number,
    ``mss_index`` and ``primary`` flag.
    """
    record = {"screen": screen, "mss_index": mss_index, "primary": primary}
    record["x"] = mon["left"]
    record["y"] = mon["top"]
    record["width"] = mon["width"]
    record["height"] = mon["height"]
    return record
|
|
|
|
|
|
def _ordered_displays(sct) -> list[dict]:
    """List the monitors of an mss session with the primary one first.

    Entry 0 of ``sct.monitors`` is skipped (in mss it is the combined virtual
    screen). The first monitor whose top-left corner is (0, 0) is treated as
    primary and becomes screen 0; if none qualifies, the first monitor is
    used. The others keep their relative order.

    Raises:
        HTTPException: 500 when no monitor is reported.
    """
    physical = [(mss_index, mon) for mss_index, mon in enumerate(sct.monitors[1:], start=1)]
    if not physical:
        raise HTTPException(status_code=500, detail="no displays detected")

    primary_pos = 0
    for pos, (_, mon) in enumerate(physical):
        if mon["left"] == 0 and mon["top"] == 0:
            primary_pos = pos
            break

    reordered = [physical[primary_pos], *physical[:primary_pos], *physical[primary_pos + 1:]]

    displays = []
    for screen, (mss_index, mon) in enumerate(reordered):
        displays.append(_display_region(mon, screen=screen, mss_index=mss_index, primary=(screen == 0)))
    return displays
|
|
|
|
|
|
def _get_displays() -> list[dict]:
    """Open a short-lived mss session and return the ordered display list."""
    mss = _import_capture_libs()[2]
    with mss.mss() as session:
        displays = _ordered_displays(session)
    return displays
|
|
|
|
|
|
def _select_display(screen: int) -> tuple[dict, list[dict], dict]:
    """Pick the display for ``screen``, falling back to screen 0.

    Returns ``(selected, displays, selection)`` where ``selection`` records
    the requested index, the index actually used, and whether a fallback
    happened.
    """
    displays = _get_displays()
    index = screen if 0 <= screen < len(displays) else 0
    selected = displays[index]
    used = selected["screen"]
    selection = {"requested": screen, "selected": used, "fallback": used != screen}
    return selected, displays, selection
|
|
|
|
|
|
def _capture_screen(screen: int = 0):
    """Grab one display and return it as an RGB image.

    An out-of-range ``screen`` falls back to screen 0.

    Returns:
        ``(image, mon, displays, selection)``: the captured image, the display
        record that was captured, all display records, and a dict recording
        the requested/selected index and whether a fallback occurred.
    """
    Image, _, mss = _import_capture_libs()
    with mss.mss() as session:
        displays = _ordered_displays(session)
        index = screen if 0 <= screen < len(displays) else 0
        mon = displays[index]
        bounds = {
            "left": mon["x"],
            "top": mon["y"],
            "width": mon["width"],
            "height": mon["height"],
        }
        shot = session.grab(bounds)
        image = Image.frombytes("RGB", shot.size, shot.rgb)

    used = mon["screen"]
    selection = {"requested": screen, "selected": used, "fallback": used != screen}
    return image, mon, displays, selection
|
|
|
|
|
|
def _serialize_image(image, image_format: str, jpeg_quality: int) -> bytes:
    """Encode ``image`` to bytes.

    "jpeg" is saved as JPEG with ``jpeg_quality``; any other value is saved
    as PNG (the quality is then ignored).
    """
    if image_format == "jpeg":
        save_options = {"format": "JPEG", "quality": jpeg_quality}
    else:
        save_options = {"format": "PNG"}

    sink = io.BytesIO()
    image.save(sink, **save_options)
    return sink.getvalue()
|
|
|
|
|
|
def _encode_image(image, image_format: str, jpeg_quality: int) -> str:
    """Serialize ``image`` and return the bytes as an ASCII base64 string."""
    raw = _serialize_image(image, image_format, jpeg_quality)
    return base64.b64encode(raw).decode("ascii")
|
|
|
|
|
|
def _draw_grid(image, region_x: int, region_y: int, rows: int, cols: int, include_labels: bool):
    """Draw a labelled grid on a copy of ``image`` and describe it.

    The input image is left untouched. Interior lines and a 2-pixel border
    are drawn in red; when ``include_labels`` is true each cell gets a yellow
    "row,col" label near its centre.

    Returns:
        ``(annotated_image, meta)`` where ``meta`` holds the region
        (``region_x``/``region_y`` plus the image size) and the grid geometry,
        including the formula for mapping a cell and offset to a pixel.
    """
    ImageDraw = _import_capture_libs()[1]
    canvas = image.copy()
    pen = ImageDraw.Draw(canvas)
    width, height = canvas.size

    cell_w = width / cols
    cell_h = height / rows
    line_colour = (255, 0, 0)

    # Interior vertical lines, then interior horizontal lines.
    for col_index in range(1, cols):
        line_x = int(round(col_index * cell_w))
        pen.line([(line_x, 0), (line_x, height)], fill=line_colour, width=1)
    for row_index in range(1, rows):
        line_y = int(round(row_index * cell_h))
        pen.line([(0, line_y), (width, line_y)], fill=line_colour, width=1)

    pen.rectangle([(0, 0), (width - 1, height - 1)], outline=line_colour, width=2)

    if include_labels:
        for row_index in range(rows):
            centre_y = int((row_index + 0.5) * cell_h)
            for col_index in range(cols):
                centre_x = int((col_index + 0.5) * cell_w)
                # Shift up-left so the text sits roughly around the centre.
                pen.text((centre_x - 12, centre_y - 6), f"{row_index},{col_index}", fill=(255, 255, 0))

    point_formula = {
        "pixel_x": "region.x + ((col + 0.5 + dx*0.5) * cell_width)",
        "pixel_y": "region.y + ((row + 0.5 + dy*0.5) * cell_height)",
        "dx_range": "[-1,1]",
        "dy_range": "[-1,1]",
    }
    grid_info = {
        "rows": rows,
        "cols": cols,
        "cell_width": cell_w,
        "cell_height": cell_h,
        "indexing": "zero-based",
        "point_formula": point_formula,
    }
    meta = {
        "region": {"x": region_x, "y": region_y, "width": width, "height": height},
        "grid": grid_info,
    }
    return canvas, meta
|
|
|
|
|
|
def _resolve_target(target: Target) -> tuple[int, int, dict]:
    """Resolve a pixel or grid target to ``(x, y, info)``.

    Pixel targets resolve to ``(x + dx, y + dy)``. Grid targets resolve to the
    cell centre shifted by up to half a cell in each direction (dx/dy in
    [-1, 1]), rounded to whole pixels and offset by the region origin.
    ``info`` echoes the mode, the source model and, for grids, the derived
    cell size.
    """
    if isinstance(target, PixelTarget):
        pixel_x = target.x + target.dx
        pixel_y = target.y + target.dy
        return pixel_x, pixel_y, {"mode": "pixel", "source": target.model_dump()}

    cell_w = target.region_width / target.cols
    cell_h = target.region_height / target.rows

    # Position measured in cells from the region origin.
    col_position = target.col + 0.5 + (target.dx * 0.5)
    row_position = target.row + 0.5 + (target.dy * 0.5)
    pixel_x = target.region_x + int(round(col_position * cell_w))
    pixel_y = target.region_y + int(round(row_position * cell_h))

    info = {
        "mode": "grid",
        "source": target.model_dump(),
        "derived": {"cell_width": cell_w, "cell_height": cell_h},
    }
    return pixel_x, pixel_y, info
|
|
|
|
|
|
def _enforce_allowed_region(x: int, y: int):
    """Reject points outside the configured allowed region.

    Does nothing when no region is configured. The region is half-open: its
    left/top edges are inside, its right/bottom edges are not.

    Raises:
        HTTPException: 403 when the point lies outside the region.
    """
    region = SETTINGS["allowed_region"]
    if region is None:
        return

    left, top, width, height = region
    within_x = left <= x < left + width
    within_y = top <= y < top + height
    if within_x and within_y:
        return
    raise HTTPException(status_code=403, detail="point outside allowed region")
|
|
|
|
|
|
def _import_input_lib():
    """Lazily import pyautogui with its fail-safe switched on, and return it.

    Raises:
        HTTPException: 500 when the import (or setup) fails for any reason.
    """
    try:
        import pyautogui

        pyautogui.FAILSAFE = True
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc
    return pyautogui
|
|
|
|
|
|
def _import_ocr_libs():
    """Lazily import pytesseract and return ``(pytesseract, Output)``.

    When SETTINGS["tesseract_cmd"] is non-empty it is installed as the
    executable pytesseract should run.

    Raises:
        HTTPException: 500 when the import (or setup) fails for any reason.
    """
    try:
        import pytesseract
        from pytesseract import Output

        configured_cmd = SETTINGS["tesseract_cmd"]
        if configured_cmd:
            pytesseract.pytesseract.tesseract_cmd = configured_cmd
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"ocr backend unavailable: {exc}") from exc
    return pytesseract, Output
|
|
|
|
|
|
def _decode_image_base64(value: str):
    """Decode a base64 (or ``data:`` URL) payload into an RGB PIL image.

    Raises:
        HTTPException: 500 when PIL cannot be imported; 400 when the data URL
            has no comma, the base64 is invalid, or the bytes are not a
            readable image.
    """
    try:
        from PIL import Image
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"image decode backend unavailable: {exc}") from exc

    encoded = value.strip()
    if encoded.startswith("data:"):
        # Keep only what follows the first comma of "data:<meta>,<payload>".
        _, comma, remainder = encoded.partition(",")
        if not comma:
            raise HTTPException(status_code=400, detail="invalid data URL image payload")
        encoded = remainder

    try:
        raw_bytes = base64.b64decode(encoded, validate=True)
    except Exception as exc:
        raise HTTPException(status_code=400, detail="invalid image_base64 payload") from exc

    try:
        return Image.open(io.BytesIO(raw_bytes)).convert("RGB")
    except Exception as exc:
        raise HTTPException(status_code=400, detail="unsupported or unreadable image bytes") from exc
|
|
|
|
|
|
def _run_ocr(image, language_hint: str | None, min_confidence: float, offset_x: int = 0, offset_y: int = 0) -> list[dict]:
    """Run tesseract on ``image`` and return the recognised text blocks.

    Entries with empty text, a negative or unparsable confidence, or a
    confidence (scaled to 0..1) below ``min_confidence`` are dropped. Bounding
    boxes are shifted by ``offset_x``/``offset_y``. Blocks are ordered by
    shifted top, then shifted left, then original index.

    Raises:
        HTTPException: 500 when the tesseract executable is missing; 400 when
            tesseract reports an error.
    """
    pytesseract, Output = _import_ocr_libs()

    call_args = {
        "image": image,
        "output_type": Output.DICT,
        "config": "--oem 3 --psm 6",
    }
    if language_hint:
        call_args["lang"] = language_hint

    try:
        data = pytesseract.image_to_data(**call_args)
    except pytesseract.TesseractNotFoundError as exc:
        raise HTTPException(status_code=500, detail="tesseract executable not found") from exc
    except pytesseract.TesseractError as exc:
        raise HTTPException(status_code=400, detail=f"ocr failed: {exc}") from exc

    keyed_blocks = []
    for idx, raw_text in enumerate(data.get("text", [])):
        text = (raw_text or "").strip()
        if not text:
            continue

        try:
            conf_percent = float(str(data["conf"][idx]).strip())
        except ValueError:
            continue
        if conf_percent < 0:
            continue

        confidence = round(conf_percent / 100.0, 4)
        if confidence < min_confidence:
            continue

        left = int(data["left"][idx])
        top = int(data["top"][idx])
        width = int(data["width"][idx])
        height = int(data["height"][idx])

        box = {"x": left + offset_x, "y": top + offset_y, "width": width, "height": height}
        # idx is unique, so the key alone always decides the order.
        sort_key = (box["y"], box["x"], idx)
        keyed_blocks.append((sort_key, {"text": text, "confidence": confidence, "bbox": box}))

    keyed_blocks.sort(key=lambda pair: pair[0])
    return [block for _, block in keyed_blocks]
|
|
|
|
|
|
def _normalize_text(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", value)
    return collapsed.strip()
|
|
|
|
|
|
def _matches_text(haystack: str, needle: str, match_mode: str) -> bool:
    """Tell whether ``needle`` matches ``haystack`` under ``match_mode``.

    "exact" is a case-sensitive equality test, "regex" searches ``needle`` as
    a regular expression, and any other mode is a case-insensitive substring
    test.
    """
    if match_mode == "regex":
        return bool(re.search(needle, haystack))
    if match_mode == "exact":
        return haystack == needle
    return needle.lower() in haystack.lower()
|
|
|
|
|
|
def _windows_only(feature: str):
    """Guard for Windows-only functionality.

    Raises:
        HTTPException: 501 naming ``feature`` when not running on Windows.
    """
    if sys.platform == "win32":
        return
    raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")
|
|
|
|
|
|
def _tasklist_process_name(pid: int) -> str | None:
    """Look up the image name of process ``pid`` via the ``tasklist`` command.

    Best effort: returns ``None`` when the command cannot be run (or times out
    after 5 seconds), prints nothing, prints an "INFO:" line, or prints a
    first line that is not a quoted CSV row.
    """
    command = ["tasklist", "/FI", f"PID eq {pid}", "/FO", "CSV", "/NH"]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=5, check=False)
    except Exception:
        return None

    output_lines = (completed.stdout or "").strip().splitlines()
    first_row = output_lines[0].strip() if output_lines else ""
    if not first_row or first_row.startswith("INFO:"):
        return None

    looks_like_csv = first_row.startswith('"') and '","' in first_row
    if not looks_like_csv:
        return None
    # The first CSV column is the image name; drop its surrounding quotes.
    first_column = first_row.partition('","')[0]
    return first_column.strip('"')
|
|
|
|
|
|
def _list_windows(query: WindowQuery | None = None) -> list[dict]:
    """Enumerate top-level windows, optionally filtered by ``query``.

    Each result describes one window: handle, title, class name, owning
    pid/process name, visibility/enabled/minimized/maximized/foreground flags
    and its rectangle. Results are sorted with the foreground window first,
    then by lower-cased title, then by handle.

    Filters (all must match when given): exact ``hwnd``; ``visible_only``;
    case-insensitive ``title_contains``; ``title_regex`` searched with
    re.IGNORECASE; case-insensitive equality on ``process_name``.

    Raises:
        HTTPException: 501 on non-Windows hosts; 400 when ``title_regex`` is
            not a valid regular expression.
    """
    _windows_only("window endpoints")

    # ``import ctypes`` does not load the ``ctypes.wintypes`` submodule, so
    # ``ctypes.wintypes.RECT`` only works if something else imported it first.
    # Import it explicitly here.
    from ctypes import wintypes

    # Compile the caller-supplied pattern up front so a bad pattern becomes a
    # client error instead of an unhandled re.error.
    title_regex = None
    if query and query.title_regex:
        try:
            title_regex = re.compile(query.title_regex, re.IGNORECASE)
        except re.error as exc:
            raise HTTPException(status_code=400, detail=f"invalid title_regex: {exc}") from exc

    enum_proc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)

    user32 = ctypes.windll.user32
    user32.EnumWindows.restype = ctypes.c_bool
    user32.EnumWindows.argtypes = [enum_proc, ctypes.c_void_p]
    user32.IsWindowVisible.argtypes = [ctypes.c_void_p]
    user32.IsWindowVisible.restype = ctypes.c_bool
    user32.IsWindowEnabled.argtypes = [ctypes.c_void_p]
    user32.IsWindowEnabled.restype = ctypes.c_bool
    user32.IsIconic.argtypes = [ctypes.c_void_p]
    user32.IsIconic.restype = ctypes.c_bool
    user32.IsZoomed.argtypes = [ctypes.c_void_p]
    user32.IsZoomed.restype = ctypes.c_bool
    user32.GetWindowTextLengthW.argtypes = [ctypes.c_void_p]
    user32.GetWindowTextLengthW.restype = ctypes.c_int
    user32.GetWindowTextW.argtypes = [ctypes.c_void_p, ctypes.c_wchar_p, ctypes.c_int]
    user32.GetClassNameW.argtypes = [ctypes.c_void_p, ctypes.c_wchar_p, ctypes.c_int]
    user32.GetClassNameW.restype = ctypes.c_int
    user32.GetForegroundWindow.restype = ctypes.c_void_p
    # Declared so the handle is passed as a pointer-sized value like the
    # other calls instead of ctypes' default C int.
    user32.GetWindowThreadProcessId.argtypes = [ctypes.c_void_p, ctypes.POINTER(wintypes.DWORD)]
    user32.GetWindowThreadProcessId.restype = wintypes.DWORD
    user32.GetWindowRect.argtypes = [ctypes.c_void_p, ctypes.POINTER(wintypes.RECT)]

    foreground = int(user32.GetForegroundWindow() or 0)
    title_needle = query.title_contains.lower() if query and query.title_contains else None
    process_needle = query.process_name.lower() if query and query.process_name else None
    windows: list[dict] = []

    def _callback(hwnd, _lparam):
        # Always return True so the enumeration continues to the next window.
        hwnd_int = int(hwnd or 0)
        if query and query.hwnd is not None and hwnd_int != query.hwnd:
            return True

        visible = bool(user32.IsWindowVisible(hwnd))
        if query and query.visible_only and not visible:
            return True

        title_len = user32.GetWindowTextLengthW(hwnd)
        title_buf = ctypes.create_unicode_buffer(max(title_len + 1, 1))
        user32.GetWindowTextW(hwnd, title_buf, len(title_buf))
        title = title_buf.value

        # Cheap title filters run before the process lookup, which spawns a
        # `tasklist` subprocess per window.
        if title_needle is not None and title_needle not in title.lower():
            return True
        if title_regex is not None and not title_regex.search(title):
            return True

        pid = wintypes.DWORD()
        user32.GetWindowThreadProcessId(hwnd, ctypes.byref(pid))
        process_name = _tasklist_process_name(int(pid.value))
        if process_needle is not None and (process_name or "").lower() != process_needle:
            return True

        class_buf = ctypes.create_unicode_buffer(256)
        user32.GetClassNameW(hwnd, class_buf, len(class_buf))

        rect = wintypes.RECT()
        user32.GetWindowRect(hwnd, ctypes.byref(rect))

        windows.append(
            {
                "hwnd": hwnd_int,
                "title": title,
                "class_name": class_buf.value,
                "pid": int(pid.value),
                "process_name": process_name,
                "visible": visible,
                "enabled": bool(user32.IsWindowEnabled(hwnd)),
                "minimized": bool(user32.IsIconic(hwnd)),
                "maximized": bool(user32.IsZoomed(hwnd)),
                "foreground": hwnd_int == foreground,
                "rect": {
                    "x": int(rect.left),
                    "y": int(rect.top),
                    "width": int(rect.right - rect.left),
                    "height": int(rect.bottom - rect.top),
                },
            }
        )
        return True

    # Hold a reference to the ctypes callback for the duration of the call.
    callback = enum_proc(_callback)
    user32.EnumWindows(callback, 0)

    windows.sort(key=lambda item: (not item["foreground"], item["title"].lower(), item["hwnd"]))
    return windows
|
|
|
|
|
|
def _require_window_match(query: WindowQuery) -> dict:
    """Return the single window matching ``query``.

    Raises:
        HTTPException: 404 when nothing matches; 409 (with up to ten of the
            candidates) when several windows match and no explicit hwnd was
            given.
    """
    candidates = _list_windows(query)
    if not candidates:
        raise HTTPException(status_code=404, detail="no matching window found")

    ambiguous = query.hwnd is None and len(candidates) > 1
    if ambiguous:
        conflict = {"message": "multiple windows matched", "matches": candidates[:10]}
        raise HTTPException(status_code=409, detail=conflict)

    return candidates[0]
|
|
|
|
|
|
def _apply_window_action(req: WindowActionRequest) -> dict:
    """Apply a window action to the single window matching ``req``.

    After issuing the action the window is polled until it reaches the
    expected state or ``req.timeout_ms`` elapses.

    Returns:
        A dict with ``ok`` (the Win32 call's return value as a bool),
        ``matched`` (the window as found before the action), ``window`` (its
        last observed state, or ``None`` if it no longer exists) and
        ``closed``.

    Raises:
        HTTPException: 501 on non-Windows hosts; 404/409 from the window
            lookup; 400 for an unsupported action.
    """
    _windows_only("window endpoints")
    match = _require_window_match(req)
    hwnd = match["hwnd"]
    user32 = ctypes.windll.user32

    WM_CLOSE = 0x0010
    SW_RESTORE = 9
    SW_MINIMIZE = 6
    SW_MAXIMIZE = 3

    # NOTE(review): ShowWindow returns whether the window was previously
    # visible, not whether the call succeeded, so ``ok`` for minimize and
    # maximize is only a weak signal; ``window`` holds the verified state.
    if req.action in {"focus", "restore"}:
        user32.ShowWindow(hwnd, SW_RESTORE)
        ok = bool(user32.SetForegroundWindow(hwnd))
    elif req.action == "minimize":
        ok = bool(user32.ShowWindow(hwnd, SW_MINIMIZE))
    elif req.action == "maximize":
        ok = bool(user32.ShowWindow(hwnd, SW_MAXIMIZE))
    elif req.action == "close":
        ok = bool(user32.PostMessageW(hwnd, WM_CLOSE, 0, 0))
    else:
        raise HTTPException(status_code=400, detail="unsupported window action")

    def _settled(window) -> bool:
        # True once the window shows the state the action should produce.
        if req.action == "close":
            return window is None
        if window is None:
            return False
        if req.action in {"focus", "restore"}:
            return bool(window["foreground"] and not window["minimized"])
        if req.action == "minimize":
            return bool(window["minimized"])
        return bool(window["maximized"])

    # Monotonic clock: unaffected by wall-clock adjustments. The state is
    # always sampled at least once; previously a zero or very short timeout
    # could skip the loop entirely, leaving ``window`` as None and reporting
    # ``closed: True`` for a window that still exists.
    deadline = time.monotonic() + (req.timeout_ms / 1000.0)
    while True:
        current = _list_windows(WindowQuery(hwnd=hwnd, visible_only=False))
        final_match = current[0] if current else None
        if _settled(final_match) or time.monotonic() >= deadline:
            break
        time.sleep(0.1)

    return {
        "ok": ok,
        "matched": match,
        "window": final_match,
        "closed": final_match is None,
    }
|
|
|
|
|
|
def _launch_app(req: LaunchRequest) -> dict:
    """Start ``req.executable`` and optionally wait for one of its windows.

    In dry-run mode (global setting or per request) nothing is started and
    only the would-be ``argv``/``cwd`` are reported.

    Returns:
        A dict describing the launch (``executed``, ``dry_run``, ``argv``,
        ``cwd`` and, for real launches, ``pid``). When ``req.wait_for_window``
        is set it also carries ``window`` and ``window_found``.

    Raises:
        HTTPException: 400 when ``req.cwd`` is not a directory or the process
            cannot be started.
    """
    cwd = os.path.abspath(req.cwd) if req.cwd else None
    if cwd is not None and not os.path.isdir(cwd):
        raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = [req.executable]
    argv.extend(req.args)

    if SETTINGS["dry_run"] or req.dry_run:
        return {"executed": False, "dry_run": True, "argv": argv, "cwd": cwd}

    try:
        proc = subprocess.Popen(argv, cwd=cwd)
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"executable not found: {exc}") from exc
    except OSError as exc:
        raise HTTPException(status_code=400, detail=f"failed to launch process: {exc}") from exc

    result = {
        "executed": True,
        "dry_run": False,
        "argv": argv,
        "cwd": cwd,
        "pid": proc.pid,
    }
    if not req.wait_for_window:
        return result

    # Without an explicit matcher, look for a visible window whose process
    # name equals the executable's file name.
    query = req.match or WindowQuery(process_name=os.path.basename(req.executable), visible_only=True)
    deadline = time.time() + (req.timeout_ms / 1000.0)
    found = None
    while found is None and time.time() <= deadline:
        hits = _list_windows(query)
        if hits:
            found = hits[0]
        else:
            time.sleep(0.2)

    result["window"] = found
    result["window_found"] = found is not None
    return result
|
|
|
|
|
|
def _capture_region_image(screen: int, region_x: int | None, region_y: int | None, region_width: int | None, region_height: int | None):
    """Capture ``screen`` and optionally crop it to a region.

    Region coordinates are translated by the monitor origin (``mon["x"]``,
    ``mon["y"]``) before cropping. If any of the four region values is
    ``None`` the whole captured monitor is returned instead.

    Returns:
        ``(image, region, mon, displays, screen_selection)`` where ``region``
        is a dict with ``x``, ``y``, ``width`` and ``height``.

    Raises:
        HTTPException: 400 when the region does not fit inside the capture.
    """
    base_img, mon, displays, screen_selection = _capture_screen(screen)

    requested = (region_x, region_y, region_width, region_height)
    if any(value is None for value in requested):
        whole_monitor = {key: mon[key] for key in ("x", "y", "width", "height")}
        return base_img, whole_monitor, mon, displays, screen_selection

    # Convert to coordinates relative to the captured image.
    left, top = region_x - mon["x"], region_y - mon["y"]
    right, bottom = left + region_width, top + region_height

    fits = left >= 0 and top >= 0 and right <= base_img.size[0] and bottom <= base_img.size[1]
    if not fits:
        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

    cropped = base_img.crop((left, top, right, bottom))
    region = {"x": region_x, "y": region_y, "width": region_width, "height": region_height}
    return cropped, region, mon, displays, screen_selection
|
|
|
|
|
|
def _capture_ocr_source(req: OCRRequest, screen: int = 0):
    """Produce the image an OCR request should run on.

    ``req.mode`` selects the source: ``"image"`` decodes ``req.image_base64``,
    ``"screen"`` uses the whole captured monitor, and any other value crops
    the capture to ``req.region_*``.

    Returns:
        ``(image, region, mon, displays, screen_selection, source)``; the
        three display-related items are ``None`` in ``"image"`` mode.

    Raises:
        HTTPException: 400 when the requested region leaves the capture.
    """
    source = req.mode

    if source == "image":
        decoded = _decode_image_base64(req.image_base64 or "")
        bounds = {"x": 0, "y": 0, "width": decoded.size[0], "height": decoded.size[1]}
        return decoded, bounds, None, None, None, source

    base_img, mon, displays, screen_selection = _capture_screen(screen)

    if source == "screen":
        bounds = {"x": mon["x"], "y": mon["y"], "width": mon["width"], "height": mon["height"]}
        return base_img, bounds, mon, displays, screen_selection, source

    # Region mode: translate to coordinates relative to the captured image.
    left = req.region_x - mon["x"]
    top = req.region_y - mon["y"]
    right = left + req.region_width
    bottom = top + req.region_height

    fits = left >= 0 and top >= 0 and right <= base_img.size[0] and bottom <= base_img.size[1]
    if not fits:
        raise HTTPException(status_code=400, detail="requested region is outside the captured monitor")

    bounds = {
        "x": req.region_x,
        "y": req.region_y,
        "width": req.region_width,
        "height": req.region_height,
    }
    return base_img.crop((left, top, right, bottom)), bounds, mon, displays, screen_selection, source
|
|
|
|
|
|
def _image_diff_ratio(before, after) -> float:
    """Return the mean absolute per-channel difference, scaled by 1/255.

    For 8-bit channels the result lies between 0.0 (identical) and 1.0.

    Args:
        before: First image (a PIL image).
        after: Second image; callers are expected to pass one of equal size.
    """
    if before.mode != after.mode:
        # ImageChops.difference refuses operands whose modes differ (for
        # example an RGBA PNG against an RGB JPEG). Bring both to a common
        # mode so such pairs are compared instead of raising.
        before = before.convert("RGB")
        after = after.convert("RGB")

    diff = ImageChops.difference(before, after)
    means = ImageStat.Stat(diff).mean
    # Defensive: tolerate a scalar mean as well as the per-band list.
    channel_means = means if isinstance(means, list) else [means]
    return float(sum(channel_means) / (len(channel_means) * 255.0))
|
|
|
|
|
|
def _merge_bbox(blocks: list[dict]) -> dict:
    """Return the smallest box enclosing every block's ``bbox``.

    Each block must carry a ``bbox`` dict with ``x``, ``y``, ``width`` and
    ``height``. ``blocks`` must be non-empty (``min``/``max`` raise
    ``ValueError`` otherwise).
    """
    boxes = [block["bbox"] for block in blocks]
    left = min(box["x"] for box in boxes)
    top = min(box["y"] for box in boxes)
    right = max(box["x"] + box["width"] for box in boxes)
    bottom = max(box["y"] + box["height"] for box in boxes)
    return {
        "x": left,
        "y": top,
        "width": right - left,
        "height": bottom - top,
    }
|
|
|
|
|
|
def _group_ocr_lines(blocks: list[dict]) -> list[dict]:
    """Cluster OCR blocks into text lines by vertical proximity.

    Blocks are visited top-to-bottom (then left-to-right). A block opens a
    new line when its vertical centre is further from the running average
    centre of the current line than ``max(10, 0.8 * block height)``.

    Returns:
        One dict per non-blank line with ``text`` (block texts joined by a
        space), ``confidence`` (mean, rounded to 4 places), ``bbox`` (merged
        box), ``blocks`` (members sorted by x) and ``line_index`` (position
        among all lines, including blank ones that were dropped).
    """
    if not blocks:
        return []

    def _center_y(block: dict) -> float:
        box = block["bbox"]
        return box["y"] + (box["height"] / 2)

    def _left_to_right(members: list[dict]) -> list[dict]:
        return sorted(members, key=lambda member: member["bbox"]["x"])

    ordered = sorted(blocks, key=lambda b: (b["bbox"]["y"], b["bbox"]["x"]))

    rows: list[list[dict]] = []
    pending: list[dict] = []
    running_center = None
    for block in ordered:
        center_y = _center_y(block)
        tolerance = max(10.0, block["bbox"]["height"] * 0.8)
        starts_new_row = (
            bool(pending)
            and running_center is not None
            and abs(center_y - running_center) > tolerance
        )
        if starts_new_row:
            rows.append(_left_to_right(pending))
            pending = []
        pending.append(block)
        # Track the mean centre of the line built so far.
        running_center = sum(_center_y(member) for member in pending) / len(pending)

    if pending:
        rows.append(_left_to_right(pending))

    grouped = []
    for idx, members in enumerate(rows):
        text = " ".join(member["text"] for member in members).strip()
        if text:
            mean_confidence = sum(member["confidence"] for member in members) / len(members)
            grouped.append(
                {
                    "text": text,
                    "confidence": round(mean_confidence, 4),
                    "bbox": _merge_bbox(members),
                    "blocks": members,
                    "line_index": idx,
                }
            )
    return grouped
|
|
|
|
|
|
def _find_text_matches(blocks: list[dict], query: str, match_mode: str, group_lines: bool, max_results: int) -> list[dict]:
    """Find OCR entries whose normalized text matches ``query``.

    When ``group_lines`` is true the blocks are first merged into lines with
    ``_group_ocr_lines`` and each hit additionally carries ``blocks`` and
    ``line_index``. Entries whose normalized text is empty are ignored.

    Returns:
        At most ``max_results`` hits ordered by descending confidence, then
        by ``bbox`` y and x.
    """
    target = _normalize_text(query)
    candidates = _group_ocr_lines(blocks) if group_lines else blocks

    hits = []
    for item in candidates:
        normalized = _normalize_text(item["text"])
        if not normalized or not _matches_text(normalized, target, match_mode):
            continue
        hit = {
            "text": item["text"],
            "normalized_text": normalized,
            "confidence": item["confidence"],
            "bbox": item["bbox"],
            "grouped": group_lines,
        }
        if group_lines:
            hit["blocks"] = item["blocks"]
            hit["line_index"] = item["line_index"]
        hits.append(hit)

    ranked = sorted(hits, key=lambda hit: (-hit["confidence"], hit["bbox"]["y"], hit["bbox"]["x"]))
    return ranked[:max_results]
|
|
|
|
|
|
def _compute_visual_diff(req: VisionDiffRequest, screen: int = 0) -> dict:
    """Measure how much two images, or two captures of a region, differ.

    In ``"image"`` mode the two base64 payloads on the request are compared.
    Otherwise the region is captured, ``req.delay_ms`` is waited (when
    positive) and the same region is captured again.

    Returns:
        A dict with ``mode``, ``region``, ``diff_ratio``, ``changed``
        (``diff_ratio >= req.diff_threshold``) and ``diff_threshold``;
        capture mode adds ``screen``, ``display`` and ``delay_ms``.

    Raises:
        HTTPException: 400 when the supplied images differ in size.
    """
    threshold = req.diff_threshold

    if req.mode == "image":
        before = _decode_image_base64(req.before_image_base64 or "")
        after = _decode_image_base64(req.after_image_base64 or "")
        if before.size != after.size:
            raise HTTPException(status_code=400, detail="before and after images must have matching dimensions")
        ratio = _image_diff_ratio(before, after)
        return {
            "mode": req.mode,
            "region": {"x": 0, "y": 0, "width": before.size[0], "height": before.size[1]},
            "diff_ratio": ratio,
            "changed": ratio >= threshold,
            "diff_threshold": threshold,
        }

    before, region, mon, _displays, screen_selection = _capture_region_image(
        screen,
        req.region_x,
        req.region_y,
        req.region_width,
        req.region_height,
    )

    delay_ms = req.delay_ms
    if delay_ms > 0:
        time.sleep(delay_ms / 1000.0)

    # Re-capture exactly the region that the first capture resolved to.
    after, _, _, _, _ = _capture_region_image(
        screen,
        region["x"],
        region["y"],
        region["width"],
        region["height"],
    )
    ratio = _image_diff_ratio(before, after)
    return {
        "mode": req.mode,
        "region": region,
        "diff_ratio": ratio,
        "changed": ratio >= threshold,
        "diff_threshold": threshold,
        "screen": screen_selection,
        "display": mon,
        "delay_ms": req.delay_ms,
    }
|
|
|
|
|
|
def _measure_stability(req: VisionStabilityRequest, screen: int = 0) -> dict:
    """Sample a screen region repeatedly and report how much it changes.

    After an initial capture, the region is re-captured every
    ``req.sample_interval_ms`` until ``req.duration_ms`` has elapsed. Each
    sample is compared with the previous one (not with the first capture).

    At least one sample is always taken, so ``stable`` is never reported on
    the basis of zero comparisons.

    Returns:
        A dict with ``stable`` (``max_diff_ratio <= req.diff_threshold``),
        ``region``, ``sample_count``, ``max_diff_ratio``, ``avg_diff_ratio``
        (rounded to 6 places) and the request/display parameters used.
    """
    previous, region, mon, _displays, screen_selection = _capture_region_image(
        screen,
        req.region_x,
        req.region_y,
        req.region_width,
        req.region_height,
    )

    interval_s = req.sample_interval_ms / 1000.0
    deadline = time.time() + (req.duration_ms / 1000.0)
    diffs: list[float] = []

    # Do-while shape: the former `while now < deadline` loop could be skipped
    # entirely for a zero/very short duration, yielding `stable: True` with
    # `sample_count: 0`.
    while True:
        time.sleep(interval_s)
        current, _, _, _, _ = _capture_region_image(
            screen,
            region["x"],
            region["y"],
            region["width"],
            region["height"],
        )
        diffs.append(_image_diff_ratio(previous, current))
        previous = current
        if time.time() >= deadline:
            break

    max_diff_ratio = max(diffs, default=0.0)
    return {
        "stable": max_diff_ratio <= req.diff_threshold,
        "region": region,
        "sample_count": len(diffs),
        "max_diff_ratio": max_diff_ratio,
        "avg_diff_ratio": round(sum(diffs) / len(diffs), 6) if diffs else 0.0,
        "diff_threshold": req.diff_threshold,
        "duration_ms": req.duration_ms,
        "sample_interval_ms": req.sample_interval_ms,
        "screen": screen_selection,
        "display": mon,
    }
|
|
|
|
|
|
def _run_verified_action(req: VerifyActionRequest, screen: int = 0) -> dict:
    """Run an action and verify its effect, retrying up to ``req.retries`` times.

    Each attempt executes ``req.action`` and then waits for ``req.condition``
    via ``_wait_for_condition``. An action that raises is recorded; it aborts
    the whole run only when ``req.stop_on_action_error`` is set, otherwise
    verification still runs for that attempt.

    Returns:
        ``{"success", "attempts", "final_attempt"}``. Every attempt record has
        the same keys: ``attempt``, ``action_ok``, ``action_error``,
        ``action_result`` and ``verification`` (``None`` when verification
        was skipped because of an action error).
    """
    attempts = []
    for attempt in range(req.retries + 1):
        action_result = None
        action_error = None
        try:
            action_result = _exec_action(req.action, screen)
        except Exception as exc:
            # Deliberately broad: any failure of the action is reported in
            # the attempt record instead of aborting the request.
            action_error = str(exc)
        action_ok = action_error is None

        record = {
            "attempt": attempt,
            "action_ok": action_ok,
            "action_error": action_error,
            # Previously missing from the stop-on-error record, which made
            # the two record shapes inconsistent for consumers.
            "action_result": action_result,
            "verification": None,
        }

        if not action_ok and req.stop_on_action_error:
            attempts.append(record)
            return {"success": False, "attempts": attempts, "final_attempt": attempt}

        verification = _wait_for_condition(
            WaitRequest(
                condition=req.condition,
                timeout_ms=req.timeout_ms,
                poll_interval_ms=req.poll_interval_ms,
            ),
            screen,
        )
        record["verification"] = verification
        attempts.append(record)

        if verification.get("satisfied"):
            return {"success": True, "attempts": attempts, "final_attempt": attempt}

        # Pause between attempts, but not after the last one.
        if attempt < req.retries and req.retry_delay_ms > 0:
            time.sleep(req.retry_delay_ms / 1000.0)

    return {"success": False, "attempts": attempts, "final_attempt": req.retries}
|
|
|
|
|
|
# Poll until `req.condition` is satisfied or `req.timeout_ms` elapses.
#
# Three condition types are handled:
#   * WaitVisualCondition - compares successive captures of a screen region.
#   * WaitWindowCondition - checks `_list_windows` for exists/focused/closed.
#   * WaitTextCondition   - OCRs a screen region and looks for matching text.
# Any other condition type raises HTTPException(400).
#
# Returns a dict that always contains "satisfied", "kind" and "polls"; the
# remaining keys depend on the condition type and on whether it was satisfied.
#
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# of some statements (see the note on `baseline = current`) cannot be
# confirmed from the text alone.
def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
|
|
condition = req.condition
|
|
deadline = time.time() + (req.timeout_ms / 1000.0)
|
|
polls = 0
|
|
|
|
# --- Visual conditions: handled in their own loop, never falls through. ---
if isinstance(condition, WaitVisualCondition):
|
|
baseline, region, mon, displays, screen_selection = _capture_region_image(
|
|
screen,
|
|
condition.region_x,
|
|
condition.region_y,
|
|
condition.region_width,
|
|
condition.region_height,
|
|
)
|
|
stable_since = None
|
|
last_diff = 0.0
|
|
while True:
|
|
# The deadline is checked before sleeping/capturing, so an already-expired
# timeout returns without taking any comparison sample (polls == 0).
if time.time() > deadline:
|
|
return {
|
|
"satisfied": False,
|
|
"kind": condition.kind,
|
|
"state": condition.state,
|
|
"polls": polls,
|
|
"region": region,
|
|
"diff_ratio": last_diff,
|
|
"screen": screen_selection,
|
|
"display": mon,
|
|
}
|
|
time.sleep(req.poll_interval_ms / 1000.0)
|
|
# Re-capture the region resolved by the initial capture.
current, _, _, _, _ = _capture_region_image(
|
|
screen,
|
|
region["x"],
|
|
region["y"],
|
|
region["width"],
|
|
region["height"],
|
|
)
|
|
polls += 1
|
|
last_diff = _image_diff_ratio(baseline, current)
|
|
# "change": satisfied as soon as the difference reaches the threshold.
if condition.state == "change":
|
|
if last_diff >= condition.diff_threshold:
|
|
return {
|
|
"satisfied": True,
|
|
"kind": condition.kind,
|
|
"state": condition.state,
|
|
"polls": polls,
|
|
"region": region,
|
|
"diff_ratio": last_diff,
|
|
"screen": screen_selection,
|
|
"display": mon,
|
|
}
|
|
# Any other state: satisfied once the difference has stayed at or below the
# threshold for at least condition.stable_for_ms.
else:
|
|
if last_diff <= condition.diff_threshold:
|
|
stable_since = stable_since or time.time()
|
|
if (time.time() - stable_since) * 1000 >= condition.stable_for_ms:
|
|
return {
|
|
"satisfied": True,
|
|
"kind": condition.kind,
|
|
"state": condition.state,
|
|
"polls": polls,
|
|
"region": region,
|
|
"diff_ratio": last_diff,
|
|
"stable_for_ms": int((time.time() - stable_since) * 1000),
|
|
"screen": screen_selection,
|
|
"display": mon,
|
|
}
|
|
# The region changed too much: restart the stability timer.
else:
|
|
stable_since = None
|
|
# NOTE(review): the comparison baseline advances to the latest capture here.
# Whether this runs on every poll, only for non-"change" states, or only when
# the stability timer was reset depends on indentation that is not visible in
# this copy -- confirm against the real file before changing this loop.
baseline = current
|
|
|
|
# --- Window and text conditions: evaluate, then check the deadline, so the
# condition is evaluated at least once even with an expired timeout. ---
while True:
|
|
if isinstance(condition, WaitWindowCondition):
|
|
# The condition object itself is passed to _list_windows as the query.
matches = _list_windows(condition)
|
|
polls += 1
|
|
satisfied = False
|
|
if condition.state == "exists":
|
|
satisfied = bool(matches)
|
|
elif condition.state == "focused":
|
|
satisfied = any(item["foreground"] for item in matches)
|
|
elif condition.state == "closed":
|
|
satisfied = not matches
|
|
if satisfied:
|
|
return {
|
|
"satisfied": True,
|
|
"kind": condition.kind,
|
|
"state": condition.state,
|
|
"polls": polls,
|
|
# Only the first ten matching windows are reported.
"matches": matches[:10],
|
|
}
|
|
elif isinstance(condition, WaitTextCondition):
|
|
image, region, mon, displays, screen_selection = _capture_region_image(
|
|
screen,
|
|
condition.region_x,
|
|
condition.region_y,
|
|
condition.region_width,
|
|
condition.region_height,
|
|
)
|
|
blocks = _run_ocr(
|
|
image,
|
|
condition.language_hint,
|
|
condition.min_confidence,
|
|
region["x"],
|
|
region["y"],
|
|
)
|
|
polls += 1
|
|
matched = []
|
|
for block in blocks:
|
|
normalized = _normalize_text(block["text"])
|
|
# NOTE(review): `target` does not depend on `block`; it is recomputed for
# every block and could be hoisted out of the loop.
target = _normalize_text(condition.text)
|
|
if _matches_text(normalized, target, condition.match):
|
|
matched.append(block)
|
|
# condition.present selects between "text appeared" and "text is absent".
satisfied = bool(matched) if condition.present else not bool(matched)
|
|
if satisfied:
|
|
return {
|
|
"satisfied": True,
|
|
"kind": condition.kind,
|
|
"mode": condition.mode,
|
|
"polls": polls,
|
|
"region": region,
|
|
"matches": matched,
|
|
"screen": screen_selection,
|
|
"display": mon,
|
|
}
|
|
else:
|
|
raise HTTPException(status_code=400, detail="unsupported wait condition")
|
|
|
|
# Timed out: this result carries only "satisfied", "kind" and "polls".
if time.time() > deadline:
|
|
return {
|
|
"satisfied": False,
|
|
"kind": condition.kind,
|
|
"polls": polls,
|
|
}
|
|
time.sleep(req.poll_interval_ms / 1000.0)
|
|
|
|
|
|
def _pick_shell(explicit_shell: str | None) -> str:
|
|
shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
|
|
if shell_name not in {"powershell", "bash", "cmd"}:
|
|
raise HTTPException(status_code=400, detail="unsupported shell")
|
|
return shell_name
|
|
|
|
|
|
def _truncate_text(text: str, limit: int) -> tuple[str, bool]:
    """Cap ``text`` at ``limit`` characters.

    Returns:
        ``(text, False)`` when it already fits, otherwise
        ``(text[:limit], True)``.
    """
    needs_cut = len(text) > limit
    return (text[:limit], True) if needs_cut else (text, False)
|
|
|
|
|
|
def _resolve_exec_program(shell_name: str, command: str) -> list[str]:
    """Build the argv that runs ``command`` under ``shell_name``.

    Raises:
        HTTPException: 400 when ``shell_name`` is not powershell, bash or cmd.
    """
    launchers = {
        "powershell": ["powershell", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command"],
        "bash": ["bash", "-lc"],
        "cmd": ["cmd", "/c"],
    }
    prefix = launchers.get(shell_name)
    if prefix is None:
        raise HTTPException(status_code=400, detail="unsupported shell")
    # The command is always passed as one trailing argument.
    return [*prefix, command]
|
|
|
|
|
|
def _captured_output_text(value) -> str:
    """Return captured process output as text, decoding bytes when needed."""
    if value is None:
        return ""
    if isinstance(value, (bytes, bytearray)):
        # TimeoutExpired may carry raw bytes even when text=True was asked
        # for; str() on bytes would return the "b'...'" repr instead.
        return bytes(value).decode("utf-8", errors="replace")
    return str(value)


def _exec_command(req: ExecRequest) -> dict:
    """Run ``req.command`` in the selected shell and capture its output.

    The timeout is ``req.timeout_s`` (or the configured default) capped at
    ``exec_max_timeout_s``; stdout and stderr are each truncated to
    ``exec_max_output_chars``. In dry-run mode nothing is executed and only
    the resolved invocation is returned.

    Returns:
        A dict describing the invocation. For real runs it includes
        ``timed_out``, ``duration_ms``, ``exit_code`` (``None`` on timeout),
        ``stdout``/``stderr`` and their ``*_truncated`` flags.

    Raises:
        HTTPException: 403 when exec is disabled or no secret is configured;
            400 for a bad ``cwd``, an unsupported shell, or when the shell
            process cannot be started.
    """
    if not SETTINGS["exec_enabled"]:
        raise HTTPException(status_code=403, detail="exec endpoint disabled")
    if not SETTINGS["exec_secret"]:
        raise HTTPException(status_code=403, detail="exec secret not configured")

    run_dry = SETTINGS["dry_run"] or req.dry_run
    shell_name = _pick_shell(req.shell)

    timeout_s = req.timeout_s if req.timeout_s is not None else SETTINGS["exec_default_timeout_s"]
    timeout_s = min(timeout_s, SETTINGS["exec_max_timeout_s"])

    cwd = None
    if req.cwd:
        cwd = os.path.abspath(req.cwd)
        if not os.path.isdir(cwd):
            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = _resolve_exec_program(shell_name, req.command)

    # Fields shared by the dry-run, timeout and completed results.
    invocation = {
        "shell": shell_name,
        "command": req.command,
        "argv": argv,
        "timeout_s": timeout_s,
        "cwd": cwd,
    }
    if run_dry:
        return {"executed": False, "dry_run": True, **invocation}

    start = time.time()
    try:
        completed = subprocess.run(
            argv,
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        timed_out = True
        exit_code = None
        raw_stdout, raw_stderr = exc.stdout, exc.stderr
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"shell executable not found: {exc}") from exc
    except OSError as exc:
        # Mirrors _launch_app: other launch failures (e.g. permission
        # denied) are client-visible 400s rather than unhandled 500s.
        raise HTTPException(status_code=400, detail=f"failed to launch process: {exc}") from exc
    else:
        timed_out = False
        exit_code = completed.returncode
        raw_stdout, raw_stderr = completed.stdout, completed.stderr
    duration_ms = int((time.time() - start) * 1000)

    limit = SETTINGS["exec_max_output_chars"]
    stdout, stdout_truncated = _truncate_text(_captured_output_text(raw_stdout), limit)
    stderr, stderr_truncated = _truncate_text(_captured_output_text(raw_stderr), limit)

    return {
        "executed": True,
        "timed_out": timed_out,
        **invocation,
        "duration_ms": duration_ms,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
        "stdout_truncated": stdout_truncated,
        "stderr_truncated": stderr_truncated,
    }
|
|
|
|
|
|
def _exec_action(req: ActionRequest, screen: int = 0) -> dict:
    """Validate and perform one pointer/keyboard action.

    In dry-run mode (global setting or per request) the input library is not
    loaded and nothing is sent, but the request is validated exactly as a
    real run would be.

    Returns:
        A dict with ``action``, ``executed``, ``dry_run``, ``screen``,
        ``display`` and ``resolved_target`` (``None`` when no target was
        given).

    Raises:
        HTTPException: 400 when a pointer or scroll action has no target, or
            a hotkey has no keys. Errors from ``_resolve_target`` and
            ``_enforce_allowed_region`` propagate.
    """
    run_dry = SETTINGS["dry_run"] or req.dry_run
    selected_display, _displays, screen_selection = _select_display(screen)

    pyautogui = None if run_dry else _import_input_lib()

    resolved_target = None
    if req.target is not None:
        x, y, info = _resolve_target(req.target)
        _enforce_allowed_region(x, y)
        resolved_target = {"x": x, "y": y, "target_info": info}

    duration_sec = req.duration_ms / 1000.0
    action = req.action

    pointer_actions = {"move", "click", "right_click", "double_click", "middle_click"}
    if action in pointer_actions and resolved_target is None:
        raise HTTPException(status_code=400, detail="target is required for pointer actions")
    if action == "scroll" and resolved_target is None:
        raise HTTPException(status_code=400, detail="target is required for scroll")
    # Checked before the dry-run split so a dry run rejects the same requests
    # a real run would; `not req.keys` also covers a missing (None) list.
    if action == "hotkey" and not req.keys:
        raise HTTPException(status_code=400, detail="keys is required for hotkey")

    if not run_dry:
        if action == "move":
            pyautogui.moveTo(resolved_target["x"], resolved_target["y"], duration=duration_sec)
        elif action == "click":
            pyautogui.click(
                x=resolved_target["x"],
                y=resolved_target["y"],
                clicks=req.clicks,
                interval=req.interval_ms / 1000.0,
                button=req.button,
                duration=duration_sec,
            )
        elif action == "right_click":
            pyautogui.click(x=resolved_target["x"], y=resolved_target["y"], button="right", duration=duration_sec)
        elif action == "double_click":
            pyautogui.doubleClick(x=resolved_target["x"], y=resolved_target["y"], interval=req.interval_ms / 1000.0)
        elif action == "middle_click":
            pyautogui.click(x=resolved_target["x"], y=resolved_target["y"], button="middle", duration=duration_sec)
        elif action == "scroll":
            # Move first so the scroll is delivered at the target position.
            pyautogui.moveTo(resolved_target["x"], resolved_target["y"], duration=duration_sec)
            pyautogui.scroll(req.scroll_amount)
        elif action == "type":
            pyautogui.write(req.text, interval=req.interval_ms / 1000.0)
        elif action == "hotkey":
            pyautogui.hotkey(*req.keys)
        # NOTE(review): any other action value falls through silently and is
        # still reported as executed -- presumably the request model
        # restricts `action`; confirm.

    return {
        "action": req.action,
        "executed": not run_dry,
        "dry_run": run_dry,
        "screen": screen_selection,
        "display": selected_display,
        "resolved_target": resolved_target,
    }
|
|
|
|
|
|
def _localization_confidence(source: str, confidence: float | None = None) -> str:
|
|
if source == "image_tool_point":
|
|
return "high"
|
|
if source == "ocr" and confidence is not None:
|
|
if confidence >= 0.8:
|
|
return "high"
|
|
if confidence >= 0.55:
|
|
return "medium"
|
|
return "low"
|
|
|
|
|
|
@app.post("/v2/observe")
def observe_v2(req: ObserveRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Capture the screen (or a region), optionally OCR it, and store the result.

    The observation is saved in ``OBSERVATIONS`` under a fresh id so that
    later requests can refer to it. OCR runs on the whole screen when
    ``req.ocr_mode`` is ``"screen"`` and on the captured image for any other
    mode except ``"none"``.

    Raises:
        HTTPException: 400 when the OCR area exceeds ``req.max_ocr_area_px``.
    """
    wants_region = req.mode == "region"
    wants_ocr = req.ocr_mode != "none"

    capture_started = time.perf_counter()
    image, region, mon, _displays, screen_selection = _capture_region_image(
        screen,
        req.region_x if wants_region else None,
        req.region_y if wants_region else None,
        req.region_width if wants_region else None,
        req.region_height if wants_region else None,
    )
    capture_ms = int((time.perf_counter() - capture_started) * 1000)

    encoded = _encode_image(image, req.image_format, req.jpeg_quality) if req.include_image else None

    ocr_started = time.perf_counter()
    blocks: list[dict] = []
    grouped_lines: list[dict] = []
    ocr_applied_mode = "none"
    if wants_ocr:
        if req.ocr_mode == "screen":
            # Full-screen OCR takes its own capture, independent of `image`.
            ocr_image, ocr_region, _, _, _ = _capture_region_image(screen, None, None, None, None)
        else:
            ocr_image, ocr_region = image, region

        area = ocr_region["width"] * ocr_region["height"]
        over_budget = req.max_ocr_area_px is not None and area > req.max_ocr_area_px
        if over_budget:
            raise HTTPException(
                status_code=400,
                detail=f"ocr area {area} exceeds max_ocr_area_px {req.max_ocr_area_px}",
            )

        blocks = _run_ocr(
            ocr_image,
            req.language_hint,
            req.min_confidence,
            ocr_region["x"],
            ocr_region["y"],
        )
        if req.group_lines:
            grouped_lines = _group_ocr_lines(blocks)
        ocr_applied_mode = req.ocr_mode
    elapsed_ocr_ms = int((time.perf_counter() - ocr_started) * 1000)
    # Only report OCR time when OCR actually ran.
    ocr_ms = elapsed_ocr_ms if wants_ocr else 0

    width, height = image.size[0], image.size[1]
    observation_id = _request_id()
    OBSERVATIONS[observation_id] = {
        "id": observation_id,
        "region": region,
        "screen": screen_selection,
        "display": mon,
        "image_width": width,
        "image_height": height,
        "ocr_blocks": blocks,
        "ocr_lines": grouped_lines,
        "created_at_ms": _now_ms(),
    }

    image_payload = {
        "included": req.include_image,
        "format": req.image_format if req.include_image else None,
        "base64": encoded,
        "width": width,
        "height": height,
    }
    ocr_payload = {
        "mode": ocr_applied_mode,
        "min_confidence": req.min_confidence,
        "language_hint": req.language_hint,
        "block_count": len(blocks),
        "line_count": len(grouped_lines),
        "blocks": blocks,
        "lines": grouped_lines,
    }
    timing_payload = {
        "capture_ms": capture_ms,
        "ocr_ms": ocr_ms,
        "total_ms": capture_ms + ocr_ms,
    }
    return _ok(
        {
            "observation_id": observation_id,
            "region": region,
            "screen": screen_selection,
            "display": mon,
            "image": image_payload,
            "ocr": ocr_payload,
            "timing_ms": timing_payload,
        }
    )
|
|
|
|
|
|
@app.post("/v2/localize")
def localize_v2(req: LocalizeRequestV2, _: None = Depends(_auth)):
    """Turn a point or text query on a stored observation into a screen pixel.

    With ``req.image_tool_point`` the point is offset by the observation
    region's origin. Otherwise the observation's OCR lines are searched for
    ``req.text_query`` and the centre of the match selected by
    ``req.candidate_index`` is used. The result is stored in
    ``RESOLVED_TARGETS`` under a fresh id.

    Raises:
        HTTPException: 400 when the point lies outside the observation image
            (including negative coordinates) or ``candidate_index`` is out of
            range. Errors from ``_get_observation`` and
            ``_enforce_allowed_region`` propagate.
    """
    observation = _get_observation(req.observation_id)
    region = observation["region"]
    image_width = observation["image_width"]
    image_height = observation["image_height"]

    point = req.image_tool_point
    if point is not None:
        # Negative coordinates used to slip past the upper-bound-only check
        # and resolve to a pixel outside the observed image.
        outside = point.x < 0 or point.y < 0 or point.x >= image_width or point.y >= image_height
        if outside:
            raise HTTPException(status_code=400, detail="image_tool_point outside observation image bounds")
        x = region["x"] + point.x
        y = region["y"] + point.y
        _enforce_allowed_region(x, y)
        resolved_target_id = _request_id()
        RESOLVED_TARGETS[resolved_target_id] = {
            "id": resolved_target_id,
            "observation_id": req.observation_id,
            "x": x,
            "y": y,
            "source": "image_tool_point",
        }
        return _ok(
            {
                "resolved_target_id": resolved_target_id,
                "source": "image_tool_point",
                "localization_confidence": _localization_confidence("image_tool_point"),
                "pixel": {"x": x, "y": y},
                "observation_region": region,
                "image_bounds": {"width": image_width, "height": image_height},
            }
        )

    # Prefer the lines stored with the observation; group the raw blocks only
    # when none were stored.
    lines = observation.get("ocr_lines") or _group_ocr_lines(observation.get("ocr_blocks", []))
    matches = _find_text_matches(lines, req.text_query or "", req.text_match, False, 200)
    if not matches:
        return _err("not_found", "no localization candidates found", 404, {"found": False, "matches": []})
    if req.candidate_index >= len(matches):
        raise HTTPException(status_code=400, detail="candidate_index is outside match results")

    chosen = matches[req.candidate_index]
    bbox = chosen["bbox"]
    # Aim at the box centre, but always at least one pixel inside its origin.
    x = bbox["x"] + max(1, bbox["width"] // 2)
    y = bbox["y"] + max(1, bbox["height"] // 2)
    _enforce_allowed_region(x, y)

    resolved_target_id = _request_id()
    RESOLVED_TARGETS[resolved_target_id] = {
        "id": resolved_target_id,
        "observation_id": req.observation_id,
        "x": x,
        "y": y,
        "source": "ocr",
        "match": chosen,
    }
    return _ok(
        {
            "resolved_target_id": resolved_target_id,
            "source": "ocr",
            "localization_confidence": _localization_confidence("ocr", chosen.get("confidence")),
            "pixel": {"x": x, "y": y},
            "selected_match": chosen,
            "match_count": len(matches),
        }
    )
|
|
|
|
|
|
@app.post("/v2/act")
def act_v2(req: ActRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Translate a v2 action with ``_resolve_v2_action`` and execute it."""
    return _ok(_exec_action(_resolve_v2_action(req.action), screen))
|
|
|
|
|
|
@app.post("/v2/act-verify")
def act_verify_v2(req: ActVerifyRequestV2, screen: int = 0, _: None = Depends(_auth)):
    """Execute a v2 action and verify it against ``req.condition``.

    Retry and timing parameters left as ``None`` on the request are filled
    from ``_risk_defaults(req.risk_level)``. Responds with the success
    envelope when verification succeeded, otherwise with a 409
    ``verification_failed`` error carrying the same payload.
    """
    defaults = _risk_defaults(req.risk_level)

    def _setting(name: str):
        """Return the request's value for ``name``, or the risk default."""
        override = getattr(req, name)
        return defaults[name] if override is None else override

    verify_req = VerifyActionRequest(
        action=_resolve_v2_action(req.action),
        condition=req.condition,
        retries=_setting("retries"),
        timeout_ms=_setting("timeout_ms"),
        poll_interval_ms=_setting("poll_interval_ms"),
        retry_delay_ms=_setting("retry_delay_ms"),
        stop_on_action_error=req.stop_on_action_error,
    )
    result = _run_verified_action(verify_req, screen)

    payload = {"risk_level": req.risk_level, "defaults_applied": defaults}
    payload.update(result)

    if not result.get("success", False):
        return _err("verification_failed", "action verification did not satisfy condition", 409, payload)
    return _ok(payload)
|
|
|
|
|
|
@app.get("/health")
def health(_: None = Depends(_auth)):
    """Report service identity and the active safety/exec configuration.

    The exec secret itself is never returned, only whether one is set.
    """
    exec_status = {
        "enabled": SETTINGS["exec_enabled"],
        "secret_configured": bool(SETTINGS["exec_secret"]),
        "default_shell": SETTINGS["exec_default_shell"],
        "default_timeout_s": SETTINGS["exec_default_timeout_s"],
        "max_timeout_s": SETTINGS["exec_max_timeout_s"],
    }
    status = {
        "service": "clickthrough",
        "version": app.version,
        "dry_run": SETTINGS["dry_run"],
        "allowed_region": SETTINGS["allowed_region"],
        "exec": exec_status,
    }
    return _ok(status)
|
|
|
|
|
|
@app.get("/displays")
def displays(_: None = Depends(_auth)):
    """List the displays from ``_get_displays``; the default screen is 0."""
    payload = {"displays": _get_displays(), "default_screen": 0}
    return _ok(payload)
|
|
|
|
|
|
@app.post("/exec")
def exec_command(
    req: ExecRequest,
    x_clickthrough_exec_secret: Optional[str] = Header(default=None),
    _: None = Depends(_auth),
):
    """Run a shell command after checking the dedicated exec secret header.

    Raises:
        HTTPException: 403 when no exec secret is configured; 401 when the
            header is missing or does not match. Errors from
            ``_exec_command`` propagate.
    """
    expected = SETTINGS["exec_secret"]
    if not expected:
        raise HTTPException(status_code=403, detail="exec secret not configured")

    supplied = x_clickthrough_exec_secret
    # Compare as bytes: hmac.compare_digest raises TypeError for str inputs
    # containing non-ASCII characters, which would turn a bad header into a
    # 500 instead of a 401.
    authorized = bool(supplied) and hmac.compare_digest(
        supplied.encode("utf-8"),
        expected.encode("utf-8"),
    )
    if not authorized:
        raise HTTPException(status_code=401, detail="invalid exec secret")

    result = _exec_command(req)
    return _ok(result)
|
|
|
|
|
|
@app.get("/windows")
def windows(
    title_contains: str | None = None,
    title_regex: str | None = None,
    process_name: str | None = None,
    hwnd: int | None = None,
    visible_only: bool = True,
    _: None = Depends(_auth),
):
    """List windows matching the given query-string filters, with a count."""
    filters = {
        "title_contains": title_contains,
        "title_regex": title_regex,
        "process_name": process_name,
        "hwnd": hwnd,
        "visible_only": visible_only,
    }
    found = _list_windows(WindowQuery(**filters))
    return _ok({"windows": found, "count": len(found)})
|
|
|
|
|
|
@app.post("/windows/action")
def window_action(req: WindowActionRequest, _: None = Depends(_auth)):
    """Apply a window action and wrap the outcome in the success envelope."""
    return _ok(_apply_window_action(req))
|
|
|
|
|
|
@app.post("/launch")
def launch(req: LaunchRequest, _: None = Depends(_auth)):
    """Launch an application and wrap the outcome in the success envelope."""
    return _ok(_launch_app(req))
|
|
|
|
|
|
if __name__ == "__main__":
    # uvicorn is imported only when the module is run as a script.
    import uvicorn

    uvicorn.run(
        "server.app:app",
        host=SETTINGS["host"],
        port=SETTINGS["port"],
        reload=False,
    )
|