feat(vision): add screenshot diff and stability helpers
All checks were successful
python-syntax / syntax-check (push) Successful in 9s

This commit is contained in:
2026-05-01 16:24:46 +02:00
parent f00c525721
commit 02bf069425
4 changed files with 206 additions and 0 deletions

View File

@@ -246,6 +246,38 @@ class OCRFindRequest(OCRRequest):
max_results: int = Field(default=20, ge=1, le=200)
class VisionDiffRequest(BaseModel):
    # Request body for a visual diff between two frames.
    # NOTE(review): documented with comments instead of a docstring so the
    # generated schema description stays exactly as it was.

    # Where the two frames come from: the screen, a screen region, or two
    # caller-supplied images.
    mode: Literal["screen", "region", "image"] = "screen"

    # Region rectangle. All four values are mandatory when mode == "region".
    region_x: int | None = Field(default=None, ge=0)
    region_y: int | None = Field(default=None, ge=0)
    region_width: int | None = Field(default=None, gt=0)
    region_height: int | None = Field(default=None, gt=0)

    # Base64-encoded frames. Both are mandatory when mode == "image".
    before_image_base64: str | None = None
    after_image_base64: str | None = None

    # Delay in milliseconds, constrained to 0..60000.
    delay_ms: int = Field(default=300, ge=0, le=60000)

    # Threshold in [0.0, 1.0] that the measured diff ratio is compared against.
    diff_threshold: float = Field(default=0.01, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _validate_inputs(self):
        # Cross-field checks that single-field constraints cannot express.
        if self.mode == "region":
            rectangle = (
                self.region_x,
                self.region_y,
                self.region_width,
                self.region_height,
            )
            if not all(value is not None for value in rectangle):
                raise ValueError("region_x, region_y, region_width, region_height are required for mode=region")
        elif self.mode == "image":
            # Empty strings are rejected as well as None.
            if not (self.before_image_base64 and self.after_image_base64):
                raise ValueError("before_image_base64 and after_image_base64 are required for mode=image")
        return self
class VisionStabilityRequest(BaseModel):
    # Request body for a stability measurement over repeated captures.
    # NOTE(review): documented with comments instead of a docstring so the
    # generated schema description stays exactly as it was.

    # Optional region rectangle; every value defaults to None.
    region_x: int | None = Field(
        default=None,
        ge=0,
    )
    region_y: int | None = Field(
        default=None,
        ge=0,
    )
    region_width: int | None = Field(
        default=None,
        gt=0,
    )
    region_height: int | None = Field(
        default=None,
        gt=0,
    )

    # Pause between consecutive samples, in milliseconds (50..10000).
    sample_interval_ms: int = Field(
        default=250,
        ge=50,
        le=10000,
    )

    # Length of the sampling window, in milliseconds (0..120000).
    duration_ms: int = Field(
        default=1200,
        ge=0,
        le=120000,
    )

    # Threshold in [0.0, 1.0] that the measured diff ratio is compared against.
    diff_threshold: float = Field(
        default=0.01,
        ge=0.0,
        le=1.0,
    )
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
token = SETTINGS["token"]
@@ -898,6 +930,91 @@ def _find_text_matches(blocks: list[dict], query: str, match_mode: str, group_li
return matches[:max_results]
def _compute_visual_diff(req: VisionDiffRequest, screen: int = 0) -> dict:
    """Compare two frames and report how much they differ.

    For ``mode == "image"`` the two base64 payloads from the request are
    decoded and compared directly. For every other mode a frame is captured,
    the function waits ``req.delay_ms`` milliseconds, and a second frame is
    captured from the same resolved region.

    Args:
        req: Diff request carrying the mode, optional region, optional
            images, delay and threshold.
        screen: Screen index forwarded to ``_capture_region_image``.

    Returns:
        A dict with ``mode``, ``region``, ``diff_ratio``, ``changed`` and
        ``diff_threshold``. Capture modes additionally include ``screen``,
        ``display`` and ``delay_ms``.

    Raises:
        HTTPException: 400 when the two supplied images differ in size.
    """
    if req.mode == "image":
        before = _decode_image_base64(req.before_image_base64 or "")
        after = _decode_image_base64(req.after_image_base64 or "")
        if before.size != after.size:
            raise HTTPException(status_code=400, detail="before and after images must have matching dimensions")
        diff_ratio = _image_diff_ratio(before, after)
        return {
            "mode": req.mode,
            "region": {"x": 0, "y": 0, "width": before.size[0], "height": before.size[1]},
            "diff_ratio": diff_ratio,
            # Strictly greater: with diff_threshold == 0.0 identical frames
            # (ratio 0.0) must not be reported as changed.
            "changed": diff_ratio > req.diff_threshold,
            "diff_threshold": req.diff_threshold,
        }

    before, region, mon, _, screen_selection = _capture_region_image(
        screen,
        req.region_x,
        req.region_y,
        req.region_width,
        req.region_height,
    )
    if req.delay_ms > 0:
        time.sleep(req.delay_ms / 1000.0)

    # Re-use the region resolved by the first capture so both frames cover
    # exactly the same rectangle.
    after, _, _, _, _ = _capture_region_image(
        screen,
        region["x"],
        region["y"],
        region["width"],
        region["height"],
    )
    diff_ratio = _image_diff_ratio(before, after)
    return {
        "mode": req.mode,
        "region": region,
        "diff_ratio": diff_ratio,
        # Strictly greater, so this is the exact complement of the
        # ``<= diff_threshold`` test used for the "stable" verdict in
        # _measure_stability, and a 0.0 threshold does not flag identical
        # frames as changed.
        "changed": diff_ratio > req.diff_threshold,
        "diff_threshold": req.diff_threshold,
        "screen": screen_selection,
        "display": mon,
        "delay_ms": req.delay_ms,
    }
def _measure_stability(req: VisionStabilityRequest, screen: int = 0) -> dict:
    """Sample a screen region repeatedly and report whether it stayed stable.

    An initial frame is captured, then further frames are captured every
    ``req.sample_interval_ms`` milliseconds until ``req.duration_ms`` has
    elapsed. Each frame is compared with the frame captured immediately
    before it, not with the very first one.

    Args:
        req: Stability request carrying the optional region, sampling
            interval, duration and threshold.
        screen: Screen index forwarded to ``_capture_region_image``.

    Returns:
        A dict with ``stable``, ``region``, ``sample_count``,
        ``max_diff_ratio``, ``avg_diff_ratio``, ``diff_threshold``,
        ``duration_ms``, ``sample_interval_ms``, ``screen`` and ``display``.
        If the window ends before any sample is taken (for example
        ``duration_ms == 0``), ``sample_count`` is 0 and ``stable`` is True
        because ``max_diff_ratio`` stays at 0.0.
    """
    previous, region, mon, _, screen_selection = _capture_region_image(
        screen,
        req.region_x,
        req.region_y,
        req.region_width,
        req.region_height,
    )
    sample_count = 0
    max_diff_ratio = 0.0
    diffs = []

    # Monotonic clock: the sampling window must not shrink or stretch when
    # the system wall clock is adjusted while we are measuring.
    deadline = time.monotonic() + (req.duration_ms / 1000.0)
    while time.monotonic() < deadline:
        time.sleep(req.sample_interval_ms / 1000.0)
        # Re-use the region resolved by the first capture so every frame
        # covers the same rectangle.
        current, _, _, _, _ = _capture_region_image(
            screen,
            region["x"],
            region["y"],
            region["width"],
            region["height"],
        )
        diff_ratio = _image_diff_ratio(previous, current)
        diffs.append(diff_ratio)
        max_diff_ratio = max(max_diff_ratio, diff_ratio)
        sample_count += 1
        # Slide the comparison window: the next frame is compared with this one.
        previous = current

    return {
        "stable": max_diff_ratio <= req.diff_threshold,
        "region": region,
        "sample_count": sample_count,
        "max_diff_ratio": max_diff_ratio,
        "avg_diff_ratio": round(sum(diffs) / len(diffs), 6) if diffs else 0.0,
        "diff_threshold": req.diff_threshold,
        "duration_ms": req.duration_ms,
        "sample_interval_ms": req.sample_interval_ms,
        "screen": screen_selection,
        "display": mon,
    }
def _wait_for_condition(req: WaitRequest, screen: int = 0) -> dict:
condition = req.condition
deadline = time.time() + (req.timeout_ms / 1000.0)
@@ -1433,6 +1550,28 @@ def wait(req: WaitRequest, screen: int = 0, _: None = Depends(_auth)):
}
@app.post("/vision/diff")
def vision_diff(req: VisionDiffRequest, screen: int = 0, _: None = Depends(_auth)):
    # POST /vision/diff: run the visual diff and wrap it in the response
    # envelope (ok / request_id / time_ms / result).
    # NOTE(review): comments rather than a docstring, so the generated API
    # description for this route is unchanged.
    diff_payload = _compute_visual_diff(req, screen)
    # The diff is computed first, so the envelope values are generated after
    # the (possibly delayed) capture has finished.
    return dict(
        ok=True,
        request_id=_request_id(),
        time_ms=_now_ms(),
        result=diff_payload,
    )
@app.post("/vision/stability")
def vision_stability(req: VisionStabilityRequest, screen: int = 0, _: None = Depends(_auth)):
    # POST /vision/stability: run the stability measurement and wrap it in the
    # response envelope (ok / request_id / time_ms / result).
    # NOTE(review): comments rather than a docstring, so the generated API
    # description for this route is unchanged.
    stability_payload = _measure_stability(req, screen)
    # The measurement is taken first, so the envelope values are generated
    # after sampling has finished.
    return dict(
        ok=True,
        request_id=_request_id(),
        time_ms=_now_ms(),
        result=stability_payload,
    )
@app.post("/ocr")
def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
image, region, mon, displays, screen_selection, source = _capture_ocr_source(req, screen)