"""Pydantic request models for input actions, command execution, window
control, screen capture and OCR-based verification.

NOTE: model classes are documented with ``#`` comments instead of class
docstrings on purpose. Pydantic copies a model's docstring into the
``description`` of its generated JSON schema, so adding docstrings would
change the published schema.
"""

from typing import Literal, Optional

from pydantic import BaseModel, Field, model_validator


# Target variant selected by ``mode == "pixel"``: integer ``x``/``y`` plus
# integer ``dx``/``dy`` that default to 0.
# NOTE(review): dx/dy are presumably pixel offsets applied to x/y -- confirm
# against the code that consumes this model.
class PixelTarget(BaseModel):
    mode: Literal["pixel"]
    x: int
    y: int
    dx: int = 0
    dy: int = 0


# Target variant selected by ``mode == "grid"``: a zero-based (row, col) cell
# index into a ``rows`` x ``cols`` grid, together with a region rectangle.
# Region size and grid dimensions must be strictly positive; ``dx``/``dy``
# are floats restricted to [-1, 1].
# NOTE(review): dx/dy look like a normalised offset inside the cell -- confirm
# against the code that consumes this model.
class GridTarget(BaseModel):
    mode: Literal["grid"]
    region_x: int
    region_y: int
    region_width: int = Field(gt=0)
    region_height: int = Field(gt=0)
    rows: int = Field(gt=0)
    cols: int = Field(gt=0)
    row: int = Field(ge=0)
    col: int = Field(ge=0)
    dx: float = 0.0
    dy: float = 0.0

    @model_validator(mode="after")
    def _validate_indices(self):
        """Check that the cell index fits the grid and dx/dy are in [-1, 1].

        Raises:
            ValueError: if ``row >= rows`` or ``col >= cols``, or if ``dx``
                or ``dy`` is not within [-1, 1].
        """
        cell_in_grid = self.row < self.rows and self.col < self.cols
        if not cell_in_grid:
            raise ValueError("row/col must be inside rows/cols")

        # Written as a negated chained comparison so that NaN (which fails
        # every comparison) is rejected as well.
        if not (-1.0 <= self.dx <= 1.0):
            raise ValueError("dx must be in [-1, 1]")
        if not (-1.0 <= self.dy <= 1.0):
            raise ValueError("dy must be in [-1, 1]")
        return self


# Union of the two target variants; each carries a distinct ``mode`` literal.
Target = PixelTarget | GridTarget


# A single input action. ``action`` names the kind of action; the remaining
# fields are optional parameters with defaults. ``click_text`` must be
# supplied when ``action == "click_text"`` (enforced by the validator below).
class ActionRequest(BaseModel):
    action: Literal[
        "move",
        "click",
        "right_click",
        "double_click",
        "middle_click",
        "scroll",
        "type",
        "hotkey",
        "click_text",
    ]
    target: Optional[Target] = None
    duration_ms: int = Field(default=0, ge=0, le=20000)
    button: Literal["left", "right", "middle"] = "left"
    clicks: int = Field(default=1, ge=1, le=10)
    scroll_amount: int = 0
    text: str = ""
    keys: list[str] = Field(default_factory=list)
    interval_ms: int = Field(default=20, ge=0, le=5000)
    dry_run: bool = False
    # Forward reference: ClickTextAction is defined further down, and the
    # annotation is resolved by ActionRequest.model_rebuild() at module end.
    click_text: "ClickTextAction | None" = None

    @model_validator(mode="after")
    def _validate_click_text(self):
        """Require the ``click_text`` payload for the ``click_text`` action.

        Raises:
            ValueError: if ``action`` is ``"click_text"`` and ``click_text``
                is ``None``.
        """
        payload_missing = self.click_text is None
        if payload_missing and self.action == "click_text":
            raise ValueError("click_text payload is required when action=click_text")
        return self


# Request describing a command string, an optional shell choice, an optional
# timeout in seconds (1..600), an optional working directory and a dry-run
# flag.
class ExecRequest(BaseModel):
    command: str = Field(min_length=1, max_length=10000)
    shell: Literal["powershell", "bash", "cmd"] | None = None
    timeout_s: int | None = Field(default=None, ge=1, le=600)
    cwd: str | None = None
    dry_run: bool = False


# Optional criteria for selecting a window; every criterion defaults to None.
# NOTE(review): ``hwnd`` is presumably a native window handle and
# ``title_regex`` is not compiled or checked here -- confirm where it is
# validated.
class WindowQuery(BaseModel):
    title_contains: str | None = Field(default=None, max_length=512)
    title_regex: str | None = Field(default=None, max_length=512)
    process_name: str | None = Field(default=None, max_length=260)
    hwnd: int | None = Field(default=None, ge=1)
    visible_only: bool = True


# A WindowQuery extended with the window action to perform and a timeout in
# milliseconds (0..60000, default 3000).
class WindowActionRequest(WindowQuery):
    action: Literal["focus", "restore", "minimize", "maximize", "close"]
    timeout_ms: int = Field(default=3000, ge=0, le=60000)


# Request naming an executable with up to 100 arguments, plus an optional
# working directory, an optional WindowQuery in ``match``, a
# ``wait_for_window`` flag, a timeout in milliseconds and a dry-run flag.
class LaunchRequest(BaseModel):
    executable: str = Field(min_length=1, max_length=2048)
    args: list[str] = Field(default_factory=list, max_length=100)
    cwd: str | None = None
    wait_for_window: bool = False
    match: WindowQuery | None = None
    timeout_ms: int = Field(default=5000, ge=0, le=120000)
    dry_run: bool = False


# Screen capture request: optional region, grid overlay settings, image
# format and OCR options.
# NOTE(review): the four region fields are validated independently, so a
# partially specified region passes validation here -- confirm the consumer
# handles that case.
class SeeRequest(BaseModel):
    screen: int = 0
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)
    with_grid: bool = True
    grid_rows: int = Field(default=12, ge=1, le=300)
    grid_cols: int = Field(default=12, ge=1, le=300)
    include_labels: bool = True
    image_format: Literal["png", "jpeg"] = "png"
    jpeg_quality: int = Field(default=85, ge=1, le=100)
    ocr: bool = False
    ocr_min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    # NOTE(review): 0..13 presumably maps to OCR page-segmentation modes --
    # confirm against the OCR backend in use.
    ocr_psm: int | None = Field(default=None, ge=0, le=13)


# Capture request described by a centre point and a width/height (each at
# least 10), with grid overlay and image format settings.
class SeeZoomRequest(BaseModel):
    screen: int = 0
    center_x: int = Field(ge=0)
    center_y: int = Field(ge=0)
    width: int = Field(default=500, ge=10)
    height: int = Field(default=350, ge=10)
    with_grid: bool = True
    grid_rows: int = Field(default=20, ge=1, le=300)
    grid_cols: int = Field(default=20, ge=1, le=300)
    include_labels: bool = True
    image_format: Literal["png", "jpeg"] = "png"
    jpeg_quality: int = Field(default=90, ge=1, le=100)


# An ActionRequest paired with a screen index (default 0).
class InteractRequest(BaseModel):
    screen: int = 0
    action: ActionRequest


# Rectangle with a non-negative origin and strictly positive size.
class OCRRegion(BaseModel):
    x: int = Field(ge=0)
    y: int = Field(ge=0)
    width: int = Field(gt=0)
    height: int = Field(gt=0)


# Payload for the ``click_text`` action: the text to look for, how to match
# it, an optional region/screen, and which occurrence to pick. ``nth`` must
# be given exactly when ``occurrence == "nth"`` (enforced by the validator).
class ClickTextAction(BaseModel):
    text: str = Field(min_length=1, max_length=1000)
    match: Literal["contains", "exact", "regex"] = "contains"
    region: OCRRegion | None = None
    screen: int | None = None
    case_sensitive: bool = False
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    occurrence: Literal["first", "best", "nth"] = "first"
    nth: int | None = Field(default=None, ge=1, le=10000)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)

    @model_validator(mode="after")
    def _validate_nth(self):
        """Keep ``nth`` and ``occurrence`` consistent with each other.

        Raises:
            ValueError: if ``occurrence`` is ``"nth"`` without ``nth``, or
                ``nth`` is set while ``occurrence`` is something else.
        """
        wants_nth = self.occurrence == "nth"
        has_nth = self.nth is not None
        if wants_nth and not has_nth:
            raise ValueError("nth is required when occurrence=nth")
        if has_nth and not wants_nth:
            raise ValueError("nth is only allowed when occurrence=nth")
        return self


# Verification spec tagged ``type == "ocr_text_near_point"``: text to match,
# a non-negative point, a radius (1..1000, default 80) and OCR options.
class VerifyOCRTextNearPoint(BaseModel):
    type: Literal["ocr_text_near_point"]
    text: str = Field(min_length=1, max_length=1000)
    x: int = Field(ge=0)
    y: int = Field(ge=0)
    radius: int = Field(default=80, ge=1, le=1000)
    screen: int = 0
    match: Literal["contains", "exact", "regex"] = "contains"
    case_sensitive: bool = False
    min_confidence: float = Field(default=0.0, ge=0.0, le=100.0)
    ocr_lang: str = Field(default="eng", min_length=1, max_length=64)
    ocr_psm: int | None = Field(default=None, ge=0, le=13)


# An InteractRequest combined with a verification spec, a check interval
# (50..5000 ms) and an overall timeout (100..60000 ms).
class InteractVerifyRequest(BaseModel):
    action: InteractRequest
    verify: VerifyOCRTextNearPoint
    check_interval_ms: int = Field(default=250, ge=50, le=5000)
    timeout_ms: int = Field(default=3000, ge=100, le=60000)


# Resolve the "ClickTextAction | None" forward reference on ActionRequest now
# that ClickTextAction has been defined.
ActionRequest.model_rebuild()