feat: (literally) "enhance" functionality with new parameters and improved image processing

2026-05-27 22:14:32 +02:00
parent 880468ef02
commit cceed18cf1
5 changed files with 175 additions and 21 deletions
--- a/src/agent.py
+++ b/src/agent.py
@@ -9,7 +9,7 @@ import traceback
 from typing import Any, Callable

 from openai import OpenAI
-from PIL import Image, ImageEnhance, ImageFilter, ImageOps
+from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageOps

 from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
 from .pricing import estimate_cost_usd
@@ -34,7 +34,8 @@ Rules:
   - launching apps or running terminal checks
 3) For UI tasks, inspect with see_screen before clicking/typing.
 4) Coordinates are absolute screen pixels (x, y) from top-left.
-5) Use enhance(coordinate) when text/UI is unclear.
+5) Use enhance before risky clicks: small buttons/icons, dense UI, or when target confidence is below high.
+5a) For tiny controls use enhance(coordinate, region="small", mode="ui"). For tiny text use mode="text".
 6) For keyboard-heavy interactions, prefer press_key for special keys.
 6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
 7) You may call multiple tools in one step. If needed, do click then sleep.
@@ -195,7 +196,10 @@ class ScreenJobAgent:
            {
                "type": "function",
                "name": "enhance",
-                "description": "Create enhanced zoom around a coordinate for readability.",
+                "description": (
+                    "Create enhanced zoom around a coordinate for readability and precise targeting. "
+                    "Prefer this before clicking tiny or ambiguous UI targets."
+                ),
                "parameters": {
                    "type": "object",
                    "properties": {
@@ -207,7 +211,19 @@ class ScreenJobAgent:
                            },
                            "required": ["x", "y"],
                            "additionalProperties": False,
-                        }
+                        },
+                        "region": {
+                            "type": "string",
+                            "enum": ["small", "medium", "large"],
+                        },
+                        "mode": {
+                            "type": "string",
+                            "enum": ["ui", "text"],
+                        },
+                        "scale": {
+                            "type": ["integer", "string"],
+                            "description": "Zoom factor from 2 to 6. Defaults by region.",
+                        },
                    },
                    "required": ["coordinate"],
                    "additionalProperties": False,
@@ -355,6 +371,23 @@ class ScreenJobAgent:
            sec = max_seconds
        return sec

+    def _parse_int(self, value: Any, default: int = 0) -> int:
+        """Coerce a loosely typed tool argument to int, falling back to `default`."""
+        # `default` covers None, blank text, and non-numeric or non-finite values.
+        if value is None:
+            return default
+        if isinstance(value, int):  # bool is an int subclass, so True/False become 1/0
+            return int(value)
+        try:
+            if isinstance(value, float):
+                return int(round(value))  # round() raises on NaN/inf; handled below
+            text = str(value).strip()
+            if not text:
+                return default
+            return int(float(text))
+        except (ValueError, OverflowError):
+            return default
+
    def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
        image, meta = self._capture_screen(with_grid=True)
        out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
@@ -372,34 +405,106 @@ class ScreenJobAgent:

    def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
        coord = args.get("coordinate") or {}
-        x = int(coord.get("x", 0))
-        y = int(coord.get("y", 0))
+        requested_x = self._parse_int(coord.get("x", 0), default=0)
+        requested_y = self._parse_int(coord.get("y", 0), default=0)
+        region = str(args.get("region", "small") or "small").strip().lower()
+        mode = str(args.get("mode", "ui") or "ui").strip().lower()
+        if region not in {"small", "medium", "large"}:
+            region = "small"
+        if mode not in {"ui", "text"}:
+            mode = "ui"
+
+        region_half_by_preset = {
+            "small": 96,
+            "medium": 160,
+            "large": 240,
+        }
+        default_scale_by_region = {
+            "small": 4,
+            "medium": 3,
+            "large": 2,
+        }
+        raw_scale = self._parse_int(args.get("scale"), default=0)
+        scale = raw_scale if raw_scale > 0 else default_scale_by_region[region]
+        scale = clamp(scale, 2, 6)
+
        base, base_meta = self._capture_screen(with_grid=False)
        width, height = base.size

-        region_half = 180
-        left = clamp(x - region_half, 0, width - 1)
-        top = clamp(y - region_half, 0, height - 1)
-        right = clamp(x + region_half, left + 1, width)
-        bottom = clamp(y + region_half, top + 1, height)
+        source_x = clamp(requested_x, 0, max(0, width - 1))
+        source_y = clamp(requested_y, 0, max(0, height - 1))
+        region_half = region_half_by_preset[region]
+        left = clamp(source_x - region_half, 0, width - 1)
+        top = clamp(source_y - region_half, 0, height - 1)
+        right = clamp(source_x + region_half, left + 1, width)
+        bottom = clamp(source_y + region_half, top + 1, height)

        crop = base.crop((left, top, right, bottom))
-        upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC)
-        enhanced = ImageOps.autocontrast(upscaled)
-        enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0)
-        enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
-        enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))
+        out_w = max(2, crop.width * scale)
+        out_h = max(2, crop.height * scale)
+        upscaled = crop.resize((out_w, out_h), Image.Resampling.LANCZOS)

-        out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png"
+        if mode == "text":
+            text_view = ImageOps.grayscale(upscaled)
+            text_view = ImageOps.autocontrast(text_view, cutoff=1)
+            text_view = ImageOps.equalize(text_view)
+            text_view = ImageEnhance.Contrast(text_view).enhance(1.35)
+            text_view = ImageEnhance.Sharpness(text_view).enhance(2.1)
+            processed = text_view.filter(ImageFilter.UnsharpMask(radius=1.2, percent=160, threshold=1)).convert("RGB")
+        else:
+            ui_view = ImageOps.autocontrast(upscaled, cutoff=1)
+            ui_view = ImageEnhance.Contrast(ui_view).enhance(1.2)
+            ui_view = ImageEnhance.Sharpness(ui_view).enhance(1.8)
+            processed = ui_view.filter(ImageFilter.UnsharpMask(radius=1.4, percent=150, threshold=2)).convert("RGB")
+
+        edges = upscaled.convert("L").filter(ImageFilter.FIND_EDGES)
+        edges = ImageOps.autocontrast(edges, cutoff=4)
+        edge_overlay = ImageOps.colorize(edges, black=(0, 0, 0), white=(60, 220, 255))
+        enhanced = Image.blend(processed, edge_overlay, alpha=0.18)
+
+        cx = clamp((source_x - left) * scale, 0, max(0, enhanced.width - 1))
+        cy = clamp((source_y - top) * scale, 0, max(0, enhanced.height - 1))
+        draw = ImageDraw.Draw(enhanced)
+        draw.rectangle([0, 0, enhanced.width - 1, enhanced.height - 1], outline=(255, 80, 80), width=2)
+        ring_radius = max(10, int(6 * scale / 2))
+        arm_len = max(14, int(9 * scale / 2))
+        gap = max(4, int(2 * scale / 2))
+        line_width = max(2, int(scale / 2))
+        draw.ellipse(
+            [cx - ring_radius, cy - ring_radius, cx + ring_radius, cy + ring_radius],
+            outline=(255, 80, 80),
+            width=line_width,
+        )
+        draw.line([(max(0, cx - arm_len), cy), (max(0, cx - gap), cy)], fill=(255, 80, 80), width=line_width)
+        draw.line(
+            [(min(enhanced.width - 1, cx + gap), cy), (min(enhanced.width - 1, cx + arm_len), cy)],
+            fill=(255, 80, 80),
+            width=line_width,
+        )
+        draw.line([(cx, max(0, cy - arm_len)), (cx, max(0, cy - gap))], fill=(255, 80, 80), width=line_width)
+        draw.line(
+            [(cx, min(enhanced.height - 1, cy + gap)), (cx, min(enhanced.height - 1, cy + arm_len))],
+            fill=(255, 80, 80),
+            width=line_width,
+        )
+
+        out_path = self.artifacts.enhance_dir / (
+            f"enhance_step_{self.step:03d}_{source_x}_{source_y}_{region}_{mode}_x{scale}.png"
+        )
        self._save_image(enhanced, out_path)
        data_url = image_to_data_url(enhanced, "PNG")

        meta = {
            "captured_at": utc_now_iso(),
-            "source_coord": {"x": x, "y": y},
+            "requested_coord": {"x": requested_x, "y": requested_y},
+            "source_coord": {"x": source_x, "y": source_y},
            "source_box": {"left": left, "top": top, "right": right, "bottom": bottom},
-            "scale": 2,
+            "region": region,
+            "mode": mode,
+            "scale": scale,
            "path": str(out_path.resolve()),
+            "size": {"width": enhanced.width, "height": enhanced.height},
+            "target_pixel": {"x": cx, "y": cy},
            "screen_size": {"width": width, "height": height},
            "base_capture_meta": base_meta,
        }
@@ -748,6 +853,8 @@ class ScreenJobAgent:
                            f"JOB: {job}\n"
                            "You are in an action loop. Prefer execute_command for deterministic actions. "
                            "For modifier shortcuts, use a single press_key combo (example: win+r). "
+                            "Before clicking tiny buttons/icons or dense UI areas, call enhance first "
+                            "(use region='small'; use mode='text' for tiny text labels). "
                            "You can return multiple tool calls in one step (example: click then sleep). "
                            "When done call task_complete(return=..., data=...). "
                            "Before task_complete, verify the screen content is what was expected "
@@ -817,6 +924,8 @@ class ScreenJobAgent:
                                "text": (
                                    "No function call was returned. Continue by using tools. "
                                    "Use one press_key call for key combos like win+r. "
+                                    "Prefer enhance before clicking small/unclear targets "
+                                    "(region='small', mode='ui' or 'text'). "
                                    "You may call multiple tools in one step. "
                                    "Before task_complete, verify expected screen content with see_screen/enhance "
                                    "and include observed_result in data. "