From cceed18cf1022725022bc7d8fcf34870401c8dac Mon Sep 17 00:00:00 2001 From: Space-Banane Date: Wed, 27 May 2026 22:14:32 +0200 Subject: [PATCH] feat: (literally) "enhance" functionality with new parameters and improved image processing --- README.md | 4 +- SKILL.md | 8 +++ src/agent.py | 147 +++++++++++++++++++++++++++++++++----- tests/test_agent_tools.py | 35 +++++++++ todo.md | 2 +- 5 files changed, 175 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 259dac8..97ad90c 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,9 @@ Each job payload includes: - Prefer `execute_command` for deterministic actions (opening URLs, filesystem checks). - Use `see_screen` before UI interaction. -- Use `enhance` when text is unclear. +- Use `enhance` before clicking small/ambiguous targets; prefer `region="small"` for compact controls. +- Use `enhance` `mode="text"` for tiny labels/text, or `mode="ui"` for general UI. +- Optionally set `enhance` `scale` (2-6) for tighter zoom control. - Use `press_key` for non-text keys (Enter, Tab, arrows, Escape). - For shortcuts, use one `press_key` call with combo syntax (example: `win+r`). - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. diff --git a/SKILL.md b/SKILL.md index ea9ac46..4423aaa 100644 --- a/SKILL.md +++ b/SKILL.md @@ -37,6 +37,14 @@ Keyboard combo rule: - For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`. - Do not split modifier combos into separate calls. +Enhance-first click rule: + +- Before clicking small buttons/icons, dense UI, or ambiguous targets, call `enhance` first. +- Preferred preset for tiny controls: `enhance(coordinate, region="small", mode="ui")`. +- For tiny labels/text: use `mode="text"` to improve readability. +- Optional zoom control: set `scale` from `2` to `6` (defaults are tuned by region). +- After checking the enhanced image, click using the same target coordinate (or a small directional offset if needed). + Verification rule: - Before `task_complete`, verify actual on-screen content matches the expected outcome. diff --git a/src/agent.py b/src/agent.py index 20fc2f4..3f4aabf 100644 --- a/src/agent.py +++ b/src/agent.py @@ -9,7 +9,7 @@ import traceback from typing import Any, Callable from openai import OpenAI -from PIL import Image, ImageEnhance, ImageFilter, ImageOps +from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageOps from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary from .pricing import estimate_cost_usd @@ -34,7 +34,8 @@ Rules: - launching apps or running terminal checks 3) For UI tasks, inspect with see_screen before clicking/typing. 4) Coordinates are absolute screen pixels (x, y) from top-left. -5) Use enhance(coordinate) when text/UI is unclear. +5) Use enhance before risky clicks: small buttons/icons, dense UI, or when target confidence is below high. +5a) For tiny controls use enhance(coordinate, region="small", mode="ui"). For tiny text use mode="text". 6) For keyboard-heavy interactions, prefer press_key for special keys. 6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls. 7) You may call multiple tools in one step. If needed, do click then sleep. @@ -195,7 +196,10 @@ class ScreenJobAgent: { "type": "function", "name": "enhance", - "description": "Create enhanced zoom around a coordinate for readability.", + "description": ( + "Create enhanced zoom around a coordinate for readability and precise targeting. " + "Prefer this before clicking tiny or ambiguous UI targets." + ), "parameters": { "type": "object", "properties": { @@ -207,7 +211,19 @@ class ScreenJobAgent: }, "required": ["x", "y"], "additionalProperties": False, - } + }, + "region": { + "type": "string", + "enum": ["small", "medium", "large"], + }, + "mode": { + "type": "string", + "enum": ["ui", "text"], + }, + "scale": { + "type": ["integer", "string"], + "description": "Zoom factor from 2 to 6. Defaults by region.", + }, }, "required": ["coordinate"], "additionalProperties": False, @@ -355,6 +371,23 @@ class ScreenJobAgent: sec = max_seconds return sec + def _parse_int(self, value: Any, default: int = 0) -> int: + if value is None: + return default + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(round(value)) + text = str(value).strip() + if not text: + return default + try: + return int(float(text)) + except Exception: # noqa: BLE001 + return default + def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]: image, meta = self._capture_screen(with_grid=True) out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png" @@ -372,34 +405,106 @@ class ScreenJobAgent: def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]: coord = args.get("coordinate") or {} - x = int(coord.get("x", 0)) - y = int(coord.get("y", 0)) + requested_x = self._parse_int(coord.get("x", 0), default=0) + requested_y = self._parse_int(coord.get("y", 0), default=0) + region = str(args.get("region", "small") or "small").strip().lower() + mode = str(args.get("mode", "ui") or "ui").strip().lower() + if region not in {"small", "medium", "large"}: + region = "small" + if mode not in {"ui", "text"}: + mode = "ui" + + region_half_by_preset = { + "small": 96, + "medium": 160, + "large": 240, + } + default_scale_by_region = { + "small": 4, + "medium": 3, + "large": 2, + } + raw_scale = self._parse_int(args.get("scale"), default=0) + scale = raw_scale if raw_scale > 0 else default_scale_by_region[region] + scale = clamp(scale, 2, 6) + base, base_meta = self._capture_screen(with_grid=False) width, height = base.size - region_half = 180 - left = clamp(x - region_half, 0, width - 1) - top = clamp(y - region_half, 0, height - 1) - right = clamp(x + region_half, left + 1, width) - bottom = clamp(y + region_half, top + 1, height) + source_x = clamp(requested_x, 0, max(0, width - 1)) + source_y = clamp(requested_y, 0, max(0, height - 1)) + region_half = region_half_by_preset[region] + left = clamp(source_x - region_half, 0, width - 1) + top = clamp(source_y - region_half, 0, height - 1) + right = clamp(source_x + region_half, left + 1, width) + bottom = clamp(source_y + region_half, top + 1, height) crop = base.crop((left, top, right, bottom)) - upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC) - enhanced = ImageOps.autocontrast(upscaled) - enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0) - enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25) - enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2)) + out_w = max(2, crop.width * scale) + out_h = max(2, crop.height * scale) + upscaled = crop.resize((out_w, out_h), Image.Resampling.LANCZOS) - out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png" + if mode == "text": + text_view = ImageOps.grayscale(upscaled) + text_view = ImageOps.autocontrast(text_view, cutoff=1) + text_view = ImageOps.equalize(text_view) + text_view = ImageEnhance.Contrast(text_view).enhance(1.35) + text_view = ImageEnhance.Sharpness(text_view).enhance(2.1) + processed = text_view.filter(ImageFilter.UnsharpMask(radius=1.2, percent=160, threshold=1)).convert("RGB") + else: + ui_view = ImageOps.autocontrast(upscaled, cutoff=1) + ui_view = ImageEnhance.Contrast(ui_view).enhance(1.2) + ui_view = ImageEnhance.Sharpness(ui_view).enhance(1.8) + processed = ui_view.filter(ImageFilter.UnsharpMask(radius=1.4, percent=150, threshold=2)).convert("RGB") + + edges = upscaled.convert("L").filter(ImageFilter.FIND_EDGES) + edges = ImageOps.autocontrast(edges, cutoff=4) + edge_overlay = ImageOps.colorize(edges, black=(0, 0, 0), white=(60, 220, 255)) + enhanced = Image.blend(processed, edge_overlay, alpha=0.18) + + cx = clamp((source_x - left) * scale, 0, max(0, enhanced.width - 1)) + cy = clamp((source_y - top) * scale, 0, max(0, enhanced.height - 1)) + draw = ImageDraw.Draw(enhanced) + draw.rectangle([0, 0, enhanced.width - 1, enhanced.height - 1], outline=(255, 80, 80), width=2) + ring_radius = max(10, int(6 * scale / 2)) + arm_len = max(14, int(9 * scale / 2)) + gap = max(4, int(2 * scale / 2)) + line_width = max(2, int(scale / 2)) + draw.ellipse( + [cx - ring_radius, cy - ring_radius, cx + ring_radius, cy + ring_radius], + outline=(255, 80, 80), + width=line_width, + ) + draw.line([(max(0, cx - arm_len), cy), (max(0, cx - gap), cy)], fill=(255, 80, 80), width=line_width) + draw.line( + [(min(enhanced.width - 1, cx + gap), cy), (min(enhanced.width - 1, cx + arm_len), cy)], + fill=(255, 80, 80), + width=line_width, + ) + draw.line([(cx, max(0, cy - arm_len)), (cx, max(0, cy - gap))], fill=(255, 80, 80), width=line_width) + draw.line( + [(cx, min(enhanced.height - 1, cy + gap)), (cx, min(enhanced.height - 1, cy + arm_len))], + fill=(255, 80, 80), + width=line_width, + ) + + out_path = self.artifacts.enhance_dir / ( + f"enhance_step_{self.step:03d}_{source_x}_{source_y}_{region}_{mode}_x{scale}.png" + ) self._save_image(enhanced, out_path) data_url = image_to_data_url(enhanced, "PNG") meta = { "captured_at": utc_now_iso(), - "source_coord": {"x": x, "y": y}, + "requested_coord": {"x": requested_x, "y": requested_y}, + "source_coord": {"x": source_x, "y": source_y}, "source_box": {"left": left, "top": top, "right": right, "bottom": bottom}, - "scale": 2, + "region": region, + "mode": mode, + "scale": scale, "path": str(out_path.resolve()), + "size": {"width": enhanced.width, "height": enhanced.height}, + "target_pixel": {"x": cx, "y": cy}, "screen_size": {"width": width, "height": height}, "base_capture_meta": base_meta, } @@ -748,6 +853,8 @@ class ScreenJobAgent: f"JOB: {job}\n" "You are in an action loop. Prefer execute_command for deterministic actions. " "For modifier shortcuts, use a single press_key combo (example: win+r). " + "Before clicking tiny buttons/icons or dense UI areas, call enhance first " + "(use region='small'; use mode='text' for tiny text labels). " "You can return multiple tool calls in one step (example: click then sleep). " "When done call task_complete(return=..., data=...). " "Before task_complete, verify the screen content is what was expected " @@ -817,6 +924,8 @@ class ScreenJobAgent: "text": ( "No function call was returned. Continue by using tools. " "Use one press_key call for key combos like win+r. " + "Prefer enhance before clicking small/unclear targets " + "(region='small', mode='ui' or 'text'). " "You may call multiple tools in one step. " "Before task_complete, verify expected screen content with see_screen/enhance " "and include observed_result in data. " diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index 963374e..d83beaa 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -91,6 +91,41 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None assert click_result["clicked"] == {"x": 110, "y": 102} +def test_enhance_defaults_to_small_ui_preset(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + result = agent._tool_enhance({"coordinate": {"x": 100, "y": 120}}) + + assert result["ok"] is True + meta = result["meta"] + assert meta["region"] == "small" + assert meta["mode"] == "ui" + assert meta["scale"] == 4 + assert Path(meta["path"]).exists() + assert meta["target_pixel"]["x"] >= 0 + assert meta["target_pixel"]["y"] >= 0 + + +def test_enhance_supports_text_mode_and_scale_clamp(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + result = agent._tool_enhance( + { + "coordinate": {"x": -99, "y": 9999}, + "region": "medium", + "mode": "text", + "scale": 99, + } + ) + + assert result["ok"] is True + meta = result["meta"] + assert meta["region"] == "medium" + assert meta["mode"] == "text" + assert meta["scale"] == 6 + assert meta["requested_coord"] == {"x": -99, "y": 9999} + assert meta["source_coord"] == {"x": 0, "y": 719} + assert Path(meta["path"]).exists() + + def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None: agent = _build_agent(tmp_path, monkeypatch) result = agent._tool_press_key({"key": "meta+r"}) diff --git a/todo.md b/todo.md index 0931135..7b75d22 100644 --- a/todo.md +++ b/todo.md @@ -4,7 +4,7 @@ - [Bug] Enforce single active desktop-control run (or a strict queue) so concurrent jobs cannot fight over the same mouse/keyboard/screen session. - [Bug] Fix run artifact collisions in `setup_artifacts()` (`run_id` is second-granularity, so two jobs in the same second can share/overwrite the same directory). - [Bug] Remove global logger handler clobbering in `setup_logger()` (`logging.getLogger("screenjob").handlers.clear()` breaks concurrent runs and can redirect logs to the wrong file). -- [Bug] More consistent clicks and more uses of enhance images. +- [x] More consistent clicks and more uses of enhance images. ## P1 - [x] Move ui.py into a seperate html file and js file.