feat: (literally) "enhance" functionality with new parameters and improved image processing
All checks were successful
CI / test (push) Successful in 7s

This commit is contained in:
Space-Banane
2026-05-27 22:14:32 +02:00
parent 880468ef02
commit cceed18cf1
5 changed files with 175 additions and 21 deletions

View File

@@ -162,7 +162,9 @@ Each job payload includes:
- Prefer `execute_command` for deterministic actions (opening URLs, filesystem checks). - Prefer `execute_command` for deterministic actions (opening URLs, filesystem checks).
- Use `see_screen` before UI interaction. - Use `see_screen` before UI interaction.
- Use `enhance` when text is unclear. - Use `enhance` before clicking small/ambiguous targets; prefer `region="small"` for compact controls.
- Use `enhance` `mode="text"` for tiny labels/text, or `mode="ui"` for general UI.
- Optionally set `enhance` `scale` (2-6) for tighter zoom control.
- Use `press_key` for non-text keys (Enter, Tab, arrows, Escape). - Use `press_key` for non-text keys (Enter, Tab, arrows, Escape).
- For shortcuts, use one `press_key` call with combo syntax (example: `win+r`). - For shortcuts, use one `press_key` call with combo syntax (example: `win+r`).
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.

View File

@@ -37,6 +37,14 @@ Keyboard combo rule:
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`. - For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
- Do not split modifier combos into separate calls. - Do not split modifier combos into separate calls.
Enhance-first click rule:
- Before clicking small buttons/icons, dense UI, or ambiguous targets, call `enhance` first.
- Preferred preset for tiny controls: `enhance(coordinate, region="small", mode="ui")`.
- For tiny labels/text: use `mode="text"` to improve readability.
- Optional zoom control: set `scale` from `2` to `6` (defaults are tuned by region).
- After checking the enhanced image, click using the same target coordinate (or a small directional offset if needed).
Verification rule: Verification rule:
- Before `task_complete`, verify actual on-screen content matches the expected outcome. - Before `task_complete`, verify actual on-screen content matches the expected outcome.

View File

@@ -9,7 +9,7 @@ import traceback
from typing import Any, Callable from typing import Any, Callable
from openai import OpenAI from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter, ImageOps from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageOps
from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
from .pricing import estimate_cost_usd from .pricing import estimate_cost_usd
@@ -34,7 +34,8 @@ Rules:
- launching apps or running terminal checks - launching apps or running terminal checks
3) For UI tasks, inspect with see_screen before clicking/typing. 3) For UI tasks, inspect with see_screen before clicking/typing.
4) Coordinates are absolute screen pixels (x, y) from top-left. 4) Coordinates are absolute screen pixels (x, y) from top-left.
5) Use enhance(coordinate) when text/UI is unclear. 5) Use enhance before risky clicks: small buttons/icons, dense UI, or when target confidence is below high.
5a) For tiny controls use enhance(coordinate, region="small", mode="ui"). For tiny text use mode="text".
6) For keyboard-heavy interactions, prefer press_key for special keys. 6) For keyboard-heavy interactions, prefer press_key for special keys.
6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls. 6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
7) You may call multiple tools in one step. If needed, do click then sleep. 7) You may call multiple tools in one step. If needed, do click then sleep.
@@ -195,7 +196,10 @@ class ScreenJobAgent:
{ {
"type": "function", "type": "function",
"name": "enhance", "name": "enhance",
"description": "Create enhanced zoom around a coordinate for readability.", "description": (
"Create enhanced zoom around a coordinate for readability and precise targeting. "
"Prefer this before clicking tiny or ambiguous UI targets."
),
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": { "properties": {
@@ -207,7 +211,19 @@ class ScreenJobAgent:
}, },
"required": ["x", "y"], "required": ["x", "y"],
"additionalProperties": False, "additionalProperties": False,
} },
"region": {
"type": "string",
"enum": ["small", "medium", "large"],
},
"mode": {
"type": "string",
"enum": ["ui", "text"],
},
"scale": {
"type": ["integer", "string"],
"description": "Zoom factor from 2 to 6. Defaults by region.",
},
}, },
"required": ["coordinate"], "required": ["coordinate"],
"additionalProperties": False, "additionalProperties": False,
@@ -355,6 +371,23 @@ class ScreenJobAgent:
sec = max_seconds sec = max_seconds
return sec return sec
def _parse_int(self, value: Any, default: int = 0) -> int:
if value is None:
return default
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(round(value))
text = str(value).strip()
if not text:
return default
try:
return int(float(text))
except Exception: # noqa: BLE001
return default
def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]: def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
image, meta = self._capture_screen(with_grid=True) image, meta = self._capture_screen(with_grid=True)
out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png" out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
@@ -372,34 +405,106 @@ class ScreenJobAgent:
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]: def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
coord = args.get("coordinate") or {} coord = args.get("coordinate") or {}
x = int(coord.get("x", 0)) requested_x = self._parse_int(coord.get("x", 0), default=0)
y = int(coord.get("y", 0)) requested_y = self._parse_int(coord.get("y", 0), default=0)
region = str(args.get("region", "small") or "small").strip().lower()
mode = str(args.get("mode", "ui") or "ui").strip().lower()
if region not in {"small", "medium", "large"}:
region = "small"
if mode not in {"ui", "text"}:
mode = "ui"
region_half_by_preset = {
"small": 96,
"medium": 160,
"large": 240,
}
default_scale_by_region = {
"small": 4,
"medium": 3,
"large": 2,
}
raw_scale = self._parse_int(args.get("scale"), default=0)
scale = raw_scale if raw_scale > 0 else default_scale_by_region[region]
scale = clamp(scale, 2, 6)
base, base_meta = self._capture_screen(with_grid=False) base, base_meta = self._capture_screen(with_grid=False)
width, height = base.size width, height = base.size
region_half = 180 source_x = clamp(requested_x, 0, max(0, width - 1))
left = clamp(x - region_half, 0, width - 1) source_y = clamp(requested_y, 0, max(0, height - 1))
top = clamp(y - region_half, 0, height - 1) region_half = region_half_by_preset[region]
right = clamp(x + region_half, left + 1, width) left = clamp(source_x - region_half, 0, width - 1)
bottom = clamp(y + region_half, top + 1, height) top = clamp(source_y - region_half, 0, height - 1)
right = clamp(source_x + region_half, left + 1, width)
bottom = clamp(source_y + region_half, top + 1, height)
crop = base.crop((left, top, right, bottom)) crop = base.crop((left, top, right, bottom))
upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC) out_w = max(2, crop.width * scale)
enhanced = ImageOps.autocontrast(upscaled) out_h = max(2, crop.height * scale)
enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0) upscaled = crop.resize((out_w, out_h), Image.Resampling.LANCZOS)
enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))
out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png" if mode == "text":
text_view = ImageOps.grayscale(upscaled)
text_view = ImageOps.autocontrast(text_view, cutoff=1)
text_view = ImageOps.equalize(text_view)
text_view = ImageEnhance.Contrast(text_view).enhance(1.35)
text_view = ImageEnhance.Sharpness(text_view).enhance(2.1)
processed = text_view.filter(ImageFilter.UnsharpMask(radius=1.2, percent=160, threshold=1)).convert("RGB")
else:
ui_view = ImageOps.autocontrast(upscaled, cutoff=1)
ui_view = ImageEnhance.Contrast(ui_view).enhance(1.2)
ui_view = ImageEnhance.Sharpness(ui_view).enhance(1.8)
processed = ui_view.filter(ImageFilter.UnsharpMask(radius=1.4, percent=150, threshold=2)).convert("RGB")
edges = upscaled.convert("L").filter(ImageFilter.FIND_EDGES)
edges = ImageOps.autocontrast(edges, cutoff=4)
edge_overlay = ImageOps.colorize(edges, black=(0, 0, 0), white=(60, 220, 255))
enhanced = Image.blend(processed, edge_overlay, alpha=0.18)
cx = clamp((source_x - left) * scale, 0, max(0, enhanced.width - 1))
cy = clamp((source_y - top) * scale, 0, max(0, enhanced.height - 1))
draw = ImageDraw.Draw(enhanced)
draw.rectangle([0, 0, enhanced.width - 1, enhanced.height - 1], outline=(255, 80, 80), width=2)
ring_radius = max(10, int(6 * scale / 2))
arm_len = max(14, int(9 * scale / 2))
gap = max(4, int(2 * scale / 2))
line_width = max(2, int(scale / 2))
draw.ellipse(
[cx - ring_radius, cy - ring_radius, cx + ring_radius, cy + ring_radius],
outline=(255, 80, 80),
width=line_width,
)
draw.line([(max(0, cx - arm_len), cy), (max(0, cx - gap), cy)], fill=(255, 80, 80), width=line_width)
draw.line(
[(min(enhanced.width - 1, cx + gap), cy), (min(enhanced.width - 1, cx + arm_len), cy)],
fill=(255, 80, 80),
width=line_width,
)
draw.line([(cx, max(0, cy - arm_len)), (cx, max(0, cy - gap))], fill=(255, 80, 80), width=line_width)
draw.line(
[(cx, min(enhanced.height - 1, cy + gap)), (cx, min(enhanced.height - 1, cy + arm_len))],
fill=(255, 80, 80),
width=line_width,
)
out_path = self.artifacts.enhance_dir / (
f"enhance_step_{self.step:03d}_{source_x}_{source_y}_{region}_{mode}_x{scale}.png"
)
self._save_image(enhanced, out_path) self._save_image(enhanced, out_path)
data_url = image_to_data_url(enhanced, "PNG") data_url = image_to_data_url(enhanced, "PNG")
meta = { meta = {
"captured_at": utc_now_iso(), "captured_at": utc_now_iso(),
"source_coord": {"x": x, "y": y}, "requested_coord": {"x": requested_x, "y": requested_y},
"source_coord": {"x": source_x, "y": source_y},
"source_box": {"left": left, "top": top, "right": right, "bottom": bottom}, "source_box": {"left": left, "top": top, "right": right, "bottom": bottom},
"scale": 2, "region": region,
"mode": mode,
"scale": scale,
"path": str(out_path.resolve()), "path": str(out_path.resolve()),
"size": {"width": enhanced.width, "height": enhanced.height},
"target_pixel": {"x": cx, "y": cy},
"screen_size": {"width": width, "height": height}, "screen_size": {"width": width, "height": height},
"base_capture_meta": base_meta, "base_capture_meta": base_meta,
} }
@@ -748,6 +853,8 @@ class ScreenJobAgent:
f"JOB: {job}\n" f"JOB: {job}\n"
"You are in an action loop. Prefer execute_command for deterministic actions. " "You are in an action loop. Prefer execute_command for deterministic actions. "
"For modifier shortcuts, use a single press_key combo (example: win+r). " "For modifier shortcuts, use a single press_key combo (example: win+r). "
"Before clicking tiny buttons/icons or dense UI areas, call enhance first "
"(use region='small'; use mode='text' for tiny text labels). "
"You can return multiple tool calls in one step (example: click then sleep). " "You can return multiple tool calls in one step (example: click then sleep). "
"When done call task_complete(return=..., data=...). " "When done call task_complete(return=..., data=...). "
"Before task_complete, verify the screen content is what was expected " "Before task_complete, verify the screen content is what was expected "
@@ -817,6 +924,8 @@ class ScreenJobAgent:
"text": ( "text": (
"No function call was returned. Continue by using tools. " "No function call was returned. Continue by using tools. "
"Use one press_key call for key combos like win+r. " "Use one press_key call for key combos like win+r. "
"Prefer enhance before clicking small/unclear targets "
"(region='small', mode='ui' or 'text'). "
"You may call multiple tools in one step. " "You may call multiple tools in one step. "
"Before task_complete, verify expected screen content with see_screen/enhance " "Before task_complete, verify expected screen content with see_screen/enhance "
"and include observed_result in data. " "and include observed_result in data. "

View File

@@ -91,6 +91,41 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None
assert click_result["clicked"] == {"x": 110, "y": 102} assert click_result["clicked"] == {"x": 110, "y": 102}
def test_enhance_defaults_to_small_ui_preset(tmp_path: Path, monkeypatch) -> None:
    """With only a coordinate given, enhance reports the small/ui preset at scale 4."""
    outcome = _build_agent(tmp_path, monkeypatch)._tool_enhance(
        {"coordinate": {"x": 100, "y": 120}}
    )
    assert outcome["ok"] is True

    details = outcome["meta"]
    # Defaults expected when region, mode and scale are all omitted.
    for key, expected in (("region", "small"), ("mode", "ui"), ("scale", 4)):
        assert details[key] == expected

    # The enhanced image must have been written to disk.
    assert Path(details["path"]).exists()

    # The marker position inside the enhanced image is never negative.
    marker = details["target_pixel"]
    for axis in ("x", "y"):
        assert marker[axis] >= 0
def test_enhance_supports_text_mode_and_scale_clamp(tmp_path: Path, monkeypatch) -> None:
    """Text mode is honoured and out-of-range scale/coordinates come back clamped."""
    request = {
        "coordinate": {"x": -99, "y": 9999},
        "region": "medium",
        "mode": "text",
        "scale": 99,
    }
    outcome = _build_agent(tmp_path, monkeypatch)._tool_enhance(request)
    assert outcome["ok"] is True

    details = outcome["meta"]
    # Explicit region and mode are echoed back; the oversized scale is capped at 6.
    for key, expected in (("region", "medium"), ("mode", "text"), ("scale", 6)):
        assert details[key] == expected

    # The raw request is preserved, while the coordinate actually used is clamped
    # (presumably the fixture screen is 720 px tall, hence y == 719).
    assert details["requested_coord"] == {"x": -99, "y": 9999}
    assert details["source_coord"] == {"x": 0, "y": 719}

    assert Path(details["path"]).exists()
def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None: def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
agent = _build_agent(tmp_path, monkeypatch) agent = _build_agent(tmp_path, monkeypatch)
result = agent._tool_press_key({"key": "meta+r"}) result = agent._tool_press_key({"key": "meta+r"})

View File

@@ -4,7 +4,7 @@
- [Bug] Enforce single active desktop-control run (or a strict queue) so concurrent jobs cannot fight over the same mouse/keyboard/screen session. - [Bug] Enforce single active desktop-control run (or a strict queue) so concurrent jobs cannot fight over the same mouse/keyboard/screen session.
- [Bug] Fix run artifact collisions in `setup_artifacts()` (`run_id` is second-granularity, so two jobs in the same second can share/overwrite the same directory). - [Bug] Fix run artifact collisions in `setup_artifacts()` (`run_id` is second-granularity, so two jobs in the same second can share/overwrite the same directory).
- [Bug] Remove global logger handler clobbering in `setup_logger()` (`logging.getLogger("screenjob").handlers.clear()` breaks concurrent runs and can redirect logs to the wrong file). - [Bug] Remove global logger handler clobbering in `setup_logger()` (`logging.getLogger("screenjob").handlers.clear()` breaks concurrent runs and can redirect logs to the wrong file).
- [Bug] More consistent clicks and more uses of enhance images. - [x] More consistent clicks and more uses of enhance images.
## P1 ## P1
- [x] Move ui.py into a separate html file and js file. - [x] Move ui.py into a separate html file and js file.