Compare commits
4 Commits
a8ef8ee552
...
cceed18cf1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cceed18cf1 | ||
|
|
880468ef02 | ||
|
|
b05a7be668 | ||
|
|
0c019474af |
@@ -162,7 +162,9 @@ Each job payload includes:
|
||||
|
||||
- Prefer `execute_command` for deterministic actions (opening URLs, filesystem checks).
|
||||
- Use `see_screen` before UI interaction.
|
||||
- Use `enhance` when text is unclear.
|
||||
- Use `enhance` before clicking small/ambiguous targets; prefer `region="small"` for compact controls.
|
||||
- Use `enhance` `mode="text"` for tiny labels/text, or `mode="ui"` for general UI.
|
||||
- Optionally set `enhance` `scale` (2-6) for tighter zoom control.
|
||||
- Use `press_key` for non-text keys (Enter, Tab, arrows, Escape).
|
||||
- For shortcuts, use one `press_key` call with combo syntax (example: `win+r`).
|
||||
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
|
||||
|
||||
8
SKILL.md
8
SKILL.md
@@ -37,6 +37,14 @@ Keyboard combo rule:
|
||||
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
|
||||
- Do not split modifier combos into separate calls.
|
||||
|
||||
Enhance-first click rule:
|
||||
|
||||
- Before clicking small buttons/icons, dense UI, or ambiguous targets, call `enhance` first.
|
||||
- Preferred preset for tiny controls: `enhance(coordinate, region="small", mode="ui")`.
|
||||
- For tiny labels/text: use `mode="text"` to improve readability.
|
||||
- Optional zoom control: set `scale` from `2` to `6` (defaults are tuned by region).
|
||||
- After checking the enhanced image, click using the same target coordinate (or a small directional offset if needed).
|
||||
|
||||
Verification rule:
|
||||
|
||||
- Before `task_complete`, verify actual on-screen content matches the expected outcome.
|
||||
|
||||
245
src/agent.py
245
src/agent.py
@@ -9,7 +9,7 @@ import traceback
|
||||
from typing import Any, Callable
|
||||
|
||||
from openai import OpenAI
|
||||
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
|
||||
from PIL import Image, ImageDraw, ImageEnhance, ImageFilter, ImageOps
|
||||
|
||||
from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
|
||||
from .pricing import estimate_cost_usd
|
||||
@@ -34,7 +34,8 @@ Rules:
|
||||
- launching apps or running terminal checks
|
||||
3) For UI tasks, inspect with see_screen before clicking/typing.
|
||||
4) Coordinates are absolute screen pixels (x, y) from top-left.
|
||||
5) Use enhance(coordinate) when text/UI is unclear.
|
||||
5) Use enhance before risky clicks: small buttons/icons, dense UI, or when target confidence is below high.
|
||||
5a) For tiny controls use enhance(coordinate, region="small", mode="ui"). For tiny text use mode="text".
|
||||
6) For keyboard-heavy interactions, prefer press_key for special keys.
|
||||
6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
|
||||
7) You may call multiple tools in one step. If needed, do click then sleep.
|
||||
@@ -76,11 +77,14 @@ class ScreenJobAgent:
|
||||
self.final_data: Any | None = None
|
||||
self.previous_response_id: str | None = None
|
||||
self.usage = UsageSummary()
|
||||
self.objective = ""
|
||||
|
||||
self.last_screen_data_url: str | None = None
|
||||
self.last_screen_meta: dict[str, Any] | None = None
|
||||
self.click_history: list[tuple[int, int, float]] = []
|
||||
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
|
||||
self.recent_tool_summaries: list[str] = []
|
||||
self.last_context_compact_step = 0
|
||||
|
||||
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
|
||||
if self.event_callback is None:
|
||||
@@ -192,7 +196,10 @@ class ScreenJobAgent:
|
||||
{
|
||||
"type": "function",
|
||||
"name": "enhance",
|
||||
"description": "Create enhanced zoom around a coordinate for readability.",
|
||||
"description": (
|
||||
"Create enhanced zoom around a coordinate for readability and precise targeting. "
|
||||
"Prefer this before clicking tiny or ambiguous UI targets."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -204,7 +211,19 @@ class ScreenJobAgent:
|
||||
},
|
||||
"required": ["x", "y"],
|
||||
"additionalProperties": False,
|
||||
}
|
||||
},
|
||||
"region": {
|
||||
"type": "string",
|
||||
"enum": ["small", "medium", "large"],
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["ui", "text"],
|
||||
},
|
||||
"scale": {
|
||||
"type": ["integer", "string"],
|
||||
"description": "Zoom factor from 2 to 6. Defaults by region.",
|
||||
},
|
||||
},
|
||||
"required": ["coordinate"],
|
||||
"additionalProperties": False,
|
||||
@@ -352,6 +371,23 @@ class ScreenJobAgent:
|
||||
sec = max_seconds
|
||||
return sec
|
||||
|
||||
def _parse_int(self, value: Any, default: int = 0) -> int:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(round(value))
|
||||
text = str(value).strip()
|
||||
if not text:
|
||||
return default
|
||||
try:
|
||||
return int(float(text))
|
||||
except Exception: # noqa: BLE001
|
||||
return default
|
||||
|
||||
def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
|
||||
image, meta = self._capture_screen(with_grid=True)
|
||||
out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
|
||||
@@ -369,34 +405,106 @@ class ScreenJobAgent:
|
||||
|
||||
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
coord = args.get("coordinate") or {}
|
||||
x = int(coord.get("x", 0))
|
||||
y = int(coord.get("y", 0))
|
||||
requested_x = self._parse_int(coord.get("x", 0), default=0)
|
||||
requested_y = self._parse_int(coord.get("y", 0), default=0)
|
||||
region = str(args.get("region", "small") or "small").strip().lower()
|
||||
mode = str(args.get("mode", "ui") or "ui").strip().lower()
|
||||
if region not in {"small", "medium", "large"}:
|
||||
region = "small"
|
||||
if mode not in {"ui", "text"}:
|
||||
mode = "ui"
|
||||
|
||||
region_half_by_preset = {
|
||||
"small": 96,
|
||||
"medium": 160,
|
||||
"large": 240,
|
||||
}
|
||||
default_scale_by_region = {
|
||||
"small": 4,
|
||||
"medium": 3,
|
||||
"large": 2,
|
||||
}
|
||||
raw_scale = self._parse_int(args.get("scale"), default=0)
|
||||
scale = raw_scale if raw_scale > 0 else default_scale_by_region[region]
|
||||
scale = clamp(scale, 2, 6)
|
||||
|
||||
base, base_meta = self._capture_screen(with_grid=False)
|
||||
width, height = base.size
|
||||
|
||||
region_half = 180
|
||||
left = clamp(x - region_half, 0, width - 1)
|
||||
top = clamp(y - region_half, 0, height - 1)
|
||||
right = clamp(x + region_half, left + 1, width)
|
||||
bottom = clamp(y + region_half, top + 1, height)
|
||||
source_x = clamp(requested_x, 0, max(0, width - 1))
|
||||
source_y = clamp(requested_y, 0, max(0, height - 1))
|
||||
region_half = region_half_by_preset[region]
|
||||
left = clamp(source_x - region_half, 0, width - 1)
|
||||
top = clamp(source_y - region_half, 0, height - 1)
|
||||
right = clamp(source_x + region_half, left + 1, width)
|
||||
bottom = clamp(source_y + region_half, top + 1, height)
|
||||
|
||||
crop = base.crop((left, top, right, bottom))
|
||||
upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC)
|
||||
enhanced = ImageOps.autocontrast(upscaled)
|
||||
enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0)
|
||||
enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
|
||||
enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))
|
||||
out_w = max(2, crop.width * scale)
|
||||
out_h = max(2, crop.height * scale)
|
||||
upscaled = crop.resize((out_w, out_h), Image.Resampling.LANCZOS)
|
||||
|
||||
out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png"
|
||||
if mode == "text":
|
||||
text_view = ImageOps.grayscale(upscaled)
|
||||
text_view = ImageOps.autocontrast(text_view, cutoff=1)
|
||||
text_view = ImageOps.equalize(text_view)
|
||||
text_view = ImageEnhance.Contrast(text_view).enhance(1.35)
|
||||
text_view = ImageEnhance.Sharpness(text_view).enhance(2.1)
|
||||
processed = text_view.filter(ImageFilter.UnsharpMask(radius=1.2, percent=160, threshold=1)).convert("RGB")
|
||||
else:
|
||||
ui_view = ImageOps.autocontrast(upscaled, cutoff=1)
|
||||
ui_view = ImageEnhance.Contrast(ui_view).enhance(1.2)
|
||||
ui_view = ImageEnhance.Sharpness(ui_view).enhance(1.8)
|
||||
processed = ui_view.filter(ImageFilter.UnsharpMask(radius=1.4, percent=150, threshold=2)).convert("RGB")
|
||||
|
||||
edges = upscaled.convert("L").filter(ImageFilter.FIND_EDGES)
|
||||
edges = ImageOps.autocontrast(edges, cutoff=4)
|
||||
edge_overlay = ImageOps.colorize(edges, black=(0, 0, 0), white=(60, 220, 255))
|
||||
enhanced = Image.blend(processed, edge_overlay, alpha=0.18)
|
||||
|
||||
cx = clamp((source_x - left) * scale, 0, max(0, enhanced.width - 1))
|
||||
cy = clamp((source_y - top) * scale, 0, max(0, enhanced.height - 1))
|
||||
draw = ImageDraw.Draw(enhanced)
|
||||
draw.rectangle([0, 0, enhanced.width - 1, enhanced.height - 1], outline=(255, 80, 80), width=2)
|
||||
ring_radius = max(10, int(6 * scale / 2))
|
||||
arm_len = max(14, int(9 * scale / 2))
|
||||
gap = max(4, int(2 * scale / 2))
|
||||
line_width = max(2, int(scale / 2))
|
||||
draw.ellipse(
|
||||
[cx - ring_radius, cy - ring_radius, cx + ring_radius, cy + ring_radius],
|
||||
outline=(255, 80, 80),
|
||||
width=line_width,
|
||||
)
|
||||
draw.line([(max(0, cx - arm_len), cy), (max(0, cx - gap), cy)], fill=(255, 80, 80), width=line_width)
|
||||
draw.line(
|
||||
[(min(enhanced.width - 1, cx + gap), cy), (min(enhanced.width - 1, cx + arm_len), cy)],
|
||||
fill=(255, 80, 80),
|
||||
width=line_width,
|
||||
)
|
||||
draw.line([(cx, max(0, cy - arm_len)), (cx, max(0, cy - gap))], fill=(255, 80, 80), width=line_width)
|
||||
draw.line(
|
||||
[(cx, min(enhanced.height - 1, cy + gap)), (cx, min(enhanced.height - 1, cy + arm_len))],
|
||||
fill=(255, 80, 80),
|
||||
width=line_width,
|
||||
)
|
||||
|
||||
out_path = self.artifacts.enhance_dir / (
|
||||
f"enhance_step_{self.step:03d}_{source_x}_{source_y}_{region}_{mode}_x{scale}.png"
|
||||
)
|
||||
self._save_image(enhanced, out_path)
|
||||
data_url = image_to_data_url(enhanced, "PNG")
|
||||
|
||||
meta = {
|
||||
"captured_at": utc_now_iso(),
|
||||
"source_coord": {"x": x, "y": y},
|
||||
"requested_coord": {"x": requested_x, "y": requested_y},
|
||||
"source_coord": {"x": source_x, "y": source_y},
|
||||
"source_box": {"left": left, "top": top, "right": right, "bottom": bottom},
|
||||
"scale": 2,
|
||||
"region": region,
|
||||
"mode": mode,
|
||||
"scale": scale,
|
||||
"path": str(out_path.resolve()),
|
||||
"size": {"width": enhanced.width, "height": enhanced.height},
|
||||
"target_pixel": {"x": cx, "y": cy},
|
||||
"screen_size": {"width": width, "height": height},
|
||||
"base_capture_meta": base_meta,
|
||||
}
|
||||
@@ -628,6 +736,9 @@ class ScreenJobAgent:
|
||||
return {"_raw": raw}
|
||||
|
||||
def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
|
||||
effort = str(self.options.reasoning_effort or "medium").strip().lower()
|
||||
if effort not in {"low", "medium", "high"}:
|
||||
effort = "medium"
|
||||
return self.client.responses.create(
|
||||
model=self.options.model,
|
||||
instructions=SYSTEM_PROMPT,
|
||||
@@ -636,9 +747,85 @@ class ScreenJobAgent:
|
||||
previous_response_id=self.previous_response_id,
|
||||
parallel_tool_calls=True,
|
||||
max_tool_calls=8,
|
||||
reasoning={"effort": effort},
|
||||
)
|
||||
|
||||
def _record_tool_summary(self, tool_name: str, result: dict[str, Any]) -> None:
|
||||
ok = bool(result.get("ok"))
|
||||
status = "ok" if ok else "fail"
|
||||
summary = f"step={self.step} tool={tool_name} status={status}"
|
||||
if tool_name == "click":
|
||||
clicked = result.get("clicked") if isinstance(result.get("clicked"), dict) else {}
|
||||
x = clicked.get("x")
|
||||
y = clicked.get("y")
|
||||
if isinstance(x, int) and isinstance(y, int):
|
||||
summary = f"{summary} at=({x},{y})"
|
||||
elif tool_name == "type":
|
||||
typed_length = int(result.get("typed_length", 0) or 0)
|
||||
summary = f"{summary} typed_length={typed_length}"
|
||||
elif tool_name == "press_key":
|
||||
key = str(result.get("key") or "").strip()
|
||||
if key:
|
||||
summary = f"{summary} key={key}"
|
||||
elif tool_name == "execute_command":
|
||||
exit_code = result.get("exit_code")
|
||||
if exit_code is not None:
|
||||
summary = f"{summary} exit_code={exit_code}"
|
||||
elif tool_name in {"see_screen", "enhance"}:
|
||||
meta = result.get("meta") if isinstance(result.get("meta"), dict) else {}
|
||||
path = str(meta.get("path") or result.get("path") or "").strip()
|
||||
if path:
|
||||
summary = f"{summary} image={path}"
|
||||
if not ok:
|
||||
error_text = str(result.get("error") or "").strip()
|
||||
if error_text:
|
||||
summary = f"{summary} error={error_text[:140]}"
|
||||
self.recent_tool_summaries.append(summary)
|
||||
self.recent_tool_summaries = self.recent_tool_summaries[-20:]
|
||||
|
||||
def _should_compact_context(self) -> bool:
|
||||
interval = max(0, int(self.options.screen_context_decay_steps or 0))
|
||||
if interval <= 0:
|
||||
return False
|
||||
if self.previous_response_id is None:
|
||||
return False
|
||||
return (self.step - self.last_context_compact_step) >= interval
|
||||
|
||||
def _build_compacted_pending_input(self) -> list[dict[str, Any]]:
|
||||
recent = self.recent_tool_summaries[-8:]
|
||||
lines = "\n".join(f"- {line}" for line in recent) if recent else "- No recent tool activity."
|
||||
content = (
|
||||
"Context compaction activated to decay stale screenshots and reduce token usage.\n"
|
||||
f"JOB: {self.objective}\n"
|
||||
f"Current step: {self.step}\n"
|
||||
"Recent tool activity:\n"
|
||||
f"{lines}\n"
|
||||
"Continue execution from the latest screen state. "
|
||||
"Use tools only, and finish with task_complete when done."
|
||||
)
|
||||
compacted_input: list[dict[str, Any]] = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "input_text",
|
||||
"text": content,
|
||||
}
|
||||
],
|
||||
}
|
||||
]
|
||||
if self.last_screen_data_url and self.last_screen_meta:
|
||||
compacted_input.append(
|
||||
self._build_visual_message(
|
||||
"Current screen after context compaction",
|
||||
self.last_screen_data_url,
|
||||
self.last_screen_meta,
|
||||
)
|
||||
)
|
||||
return compacted_input
|
||||
|
||||
def run(self, job: str) -> AgentResult:
|
||||
self.objective = job
|
||||
started_at = time.time()
|
||||
self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
|
||||
self.logger.info("Job: %s", job)
|
||||
@@ -648,6 +835,8 @@ class ScreenJobAgent:
|
||||
{
|
||||
"run_id": self.artifacts.run_id,
|
||||
"model": self.options.model,
|
||||
"reasoning_effort": self.options.reasoning_effort,
|
||||
"screen_context_decay_steps": self.options.screen_context_decay_steps,
|
||||
"objective": job,
|
||||
"disabled_tools": sorted(self.disabled_tools),
|
||||
},
|
||||
@@ -664,6 +853,8 @@ class ScreenJobAgent:
|
||||
f"JOB: {job}\n"
|
||||
"You are in an action loop. Prefer execute_command for deterministic actions. "
|
||||
"For modifier shortcuts, use a single press_key combo (example: win+r). "
|
||||
"Before clicking tiny buttons/icons or dense UI areas, call enhance first "
|
||||
"(use region='small'; use mode='text' for tiny text labels). "
|
||||
"You can return multiple tool calls in one step (example: click then sleep). "
|
||||
"When done call task_complete(return=..., data=...). "
|
||||
"Before task_complete, verify the screen content is what was expected "
|
||||
@@ -692,6 +883,19 @@ class ScreenJobAgent:
|
||||
self.step += 1
|
||||
self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
|
||||
self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
|
||||
if self._should_compact_context():
|
||||
self.previous_response_id = None
|
||||
pending_input = self._build_compacted_pending_input()
|
||||
self.last_context_compact_step = self.step
|
||||
self.logger.info("Compacted model context at step %d.", self.step)
|
||||
self._emit(
|
||||
"context_compacted",
|
||||
{
|
||||
"step": self.step,
|
||||
"decay_steps": self.options.screen_context_decay_steps,
|
||||
"recent_tool_summaries": self.recent_tool_summaries[-8:],
|
||||
},
|
||||
)
|
||||
try:
|
||||
response = self._call_model(pending_input)
|
||||
self._register_usage(response)
|
||||
@@ -720,6 +924,8 @@ class ScreenJobAgent:
|
||||
"text": (
|
||||
"No function call was returned. Continue by using tools. "
|
||||
"Use one press_key call for key combos like win+r. "
|
||||
"Prefer enhance before clicking small/unclear targets "
|
||||
"(region='small', mode='ui' or 'text'). "
|
||||
"You may call multiple tools in one step. "
|
||||
"Before task_complete, verify expected screen content with see_screen/enhance "
|
||||
"and include observed_result in data. "
|
||||
@@ -763,6 +969,7 @@ class ScreenJobAgent:
|
||||
name,
|
||||
json.dumps(result, ensure_ascii=False)[:2500],
|
||||
)
|
||||
self._record_tool_summary(name, result)
|
||||
self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
|
||||
next_input.append(
|
||||
{
|
||||
|
||||
14
src/cli.py
14
src/cli.py
@@ -28,6 +28,18 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
parser.add_argument("--command-timeout", type=int, default=45, help="Timeout in seconds for execute_command.")
|
||||
parser.add_argument("--type-interval", type=float, default=0.02, help="Seconds between typed characters.")
|
||||
parser.add_argument("--click-pause", type=float, default=0.10, help="Mouse move duration before click.")
|
||||
parser.add_argument(
|
||||
"--reasoning-effort",
|
||||
choices=["low", "medium", "high"],
|
||||
default="medium",
|
||||
help="Reasoning effort passed to the model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen-context-decay-steps",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Compact model context every N steps to decay old screenshots (0 disables).",
|
||||
)
|
||||
parser.add_argument("--disable-tool", action="append", default=[], help="Disable a tool by name.")
|
||||
parser.add_argument("--skip-safety-check", action="store_true", help="Bypass pre-flight safety check.")
|
||||
parser.add_argument("--no-failsafe", action="store_true", help="Disable PyAutoGUI fail-safe.")
|
||||
@@ -78,6 +90,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
command_timeout=args.command_timeout,
|
||||
type_interval=args.type_interval,
|
||||
click_pause=args.click_pause,
|
||||
reasoning_effort=args.reasoning_effort,
|
||||
screen_context_decay_steps=max(0, int(args.screen_context_decay_steps)),
|
||||
disable_tools=set(disabled_tools),
|
||||
)
|
||||
try:
|
||||
|
||||
@@ -58,4 +58,6 @@ class RuntimeOptions:
|
||||
command_timeout: int = 45
|
||||
type_interval: float = 0.02
|
||||
click_pause: float = 0.10
|
||||
reasoning_effort: str = "medium"
|
||||
screen_context_decay_steps: int = 4
|
||||
disable_tools: set[str] | None = None
|
||||
|
||||
@@ -25,6 +25,8 @@ class CreateJobRequest(BaseModel):
|
||||
command_timeout: int = Field(45, ge=1, le=600)
|
||||
type_interval: float = Field(0.02, ge=0.0, le=1.0)
|
||||
click_pause: float = Field(0.10, ge=0.0, le=2.0)
|
||||
reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$")
|
||||
screen_context_decay_steps: int = Field(4, ge=0, le=50)
|
||||
disabled_tools: list[str] = Field(default_factory=list)
|
||||
safety_override: bool = False
|
||||
no_failsafe: bool = False
|
||||
@@ -301,6 +303,8 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
|
||||
command_timeout=payload.command_timeout,
|
||||
type_interval=payload.type_interval,
|
||||
click_pause=payload.click_pause,
|
||||
reasoning_effort=payload.reasoning_effort,
|
||||
screen_context_decay_steps=payload.screen_context_decay_steps,
|
||||
disabled_tools=payload.disabled_tools,
|
||||
safety_override=payload.safety_override,
|
||||
no_failsafe=payload.no_failsafe,
|
||||
|
||||
@@ -48,6 +48,8 @@ class JobManager:
|
||||
command_timeout: int = 45,
|
||||
type_interval: float = 0.02,
|
||||
click_pause: float = 0.10,
|
||||
reasoning_effort: str = "medium",
|
||||
screen_context_decay_steps: int = 4,
|
||||
disabled_tools: list[str] | None = None,
|
||||
safety_override: bool = False,
|
||||
no_failsafe: bool = False,
|
||||
@@ -93,6 +95,8 @@ class JobManager:
|
||||
"command_timeout": command_timeout,
|
||||
"type_interval": type_interval,
|
||||
"click_pause": click_pause,
|
||||
"reasoning_effort": reasoning_effort,
|
||||
"screen_context_decay_steps": screen_context_decay_steps,
|
||||
"no_failsafe": no_failsafe,
|
||||
"cancel_event": cancel_event,
|
||||
},
|
||||
@@ -121,6 +125,8 @@ class JobManager:
|
||||
command_timeout: int,
|
||||
type_interval: float,
|
||||
click_pause: float,
|
||||
reasoning_effort: str,
|
||||
screen_context_decay_steps: int,
|
||||
no_failsafe: bool,
|
||||
cancel_event: threading.Event,
|
||||
) -> None:
|
||||
@@ -218,6 +224,8 @@ class JobManager:
|
||||
command_timeout=command_timeout,
|
||||
type_interval=type_interval,
|
||||
click_pause=click_pause,
|
||||
reasoning_effort=reasoning_effort,
|
||||
screen_context_decay_steps=max(0, int(screen_context_decay_steps)),
|
||||
disable_tools=set(disabled_tools),
|
||||
)
|
||||
try:
|
||||
|
||||
@@ -91,6 +91,41 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None
|
||||
assert click_result["clicked"] == {"x": 110, "y": 102}
|
||||
|
||||
|
||||
def test_enhance_defaults_to_small_ui_preset(tmp_path: Path, monkeypatch) -> None:
|
||||
agent = _build_agent(tmp_path, monkeypatch)
|
||||
result = agent._tool_enhance({"coordinate": {"x": 100, "y": 120}})
|
||||
|
||||
assert result["ok"] is True
|
||||
meta = result["meta"]
|
||||
assert meta["region"] == "small"
|
||||
assert meta["mode"] == "ui"
|
||||
assert meta["scale"] == 4
|
||||
assert Path(meta["path"]).exists()
|
||||
assert meta["target_pixel"]["x"] >= 0
|
||||
assert meta["target_pixel"]["y"] >= 0
|
||||
|
||||
|
||||
def test_enhance_supports_text_mode_and_scale_clamp(tmp_path: Path, monkeypatch) -> None:
|
||||
agent = _build_agent(tmp_path, monkeypatch)
|
||||
result = agent._tool_enhance(
|
||||
{
|
||||
"coordinate": {"x": -99, "y": 9999},
|
||||
"region": "medium",
|
||||
"mode": "text",
|
||||
"scale": 99,
|
||||
}
|
||||
)
|
||||
|
||||
assert result["ok"] is True
|
||||
meta = result["meta"]
|
||||
assert meta["region"] == "medium"
|
||||
assert meta["mode"] == "text"
|
||||
assert meta["scale"] == 6
|
||||
assert meta["requested_coord"] == {"x": -99, "y": 9999}
|
||||
assert meta["source_coord"] == {"x": 0, "y": 719}
|
||||
assert Path(meta["path"]).exists()
|
||||
|
||||
|
||||
def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
|
||||
agent = _build_agent(tmp_path, monkeypatch)
|
||||
result = agent._tool_press_key({"key": "meta+r"})
|
||||
@@ -98,3 +133,21 @@ def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
|
||||
assert result["key"] == "win+r"
|
||||
assert result["message"] == "Key combo executed."
|
||||
assert agent_module.pyautogui.last_hotkey == ("win", "r")
|
||||
|
||||
|
||||
def test_context_compaction_trigger_and_payload(tmp_path: Path, monkeypatch) -> None:
|
||||
agent = _build_agent(tmp_path, monkeypatch)
|
||||
agent.objective = "Open settings app"
|
||||
agent.previous_response_id = "resp_123"
|
||||
agent.step = 4
|
||||
agent.last_context_compact_step = 0
|
||||
agent.options.screen_context_decay_steps = 4
|
||||
agent.recent_tool_summaries = ["step=1 tool=see_screen status=ok"]
|
||||
agent.last_screen_data_url = "data:image/png;base64,abc"
|
||||
agent.last_screen_meta = {"width": 1280, "height": 720, "path": "C:/tmp/frame.png"}
|
||||
|
||||
assert agent._should_compact_context() is True
|
||||
compacted = agent._build_compacted_pending_input()
|
||||
assert len(compacted) == 2
|
||||
assert "Context compaction activated" in compacted[0]["content"][0]["text"]
|
||||
assert "Open settings app" in compacted[0]["content"][0]["text"]
|
||||
|
||||
@@ -29,7 +29,10 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
|
||||
def fake_assess_task_safety(*_args, **_kwargs):
|
||||
return True, "safe", {"safe": True}
|
||||
|
||||
captured_kwargs: dict[str, Any] = {}
|
||||
|
||||
def fake_run_job(*_args, **_kwargs):
|
||||
captured_kwargs.update(_kwargs)
|
||||
result = AgentResult(
|
||||
completed=True,
|
||||
result="Done",
|
||||
@@ -66,3 +69,5 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
|
||||
assert payload["response"]["data"] == "file1.txt\nfile2.txt"
|
||||
assert payload["return"] == "Task completed successfully"
|
||||
assert payload["data"] == "file1.txt\nfile2.txt"
|
||||
assert captured_kwargs["options"].reasoning_effort == "medium"
|
||||
assert captured_kwargs["options"].screen_context_decay_steps == 4
|
||||
|
||||
@@ -26,6 +26,8 @@ class FakeJobManager:
|
||||
command_timeout: int = 45,
|
||||
type_interval: float = 0.02,
|
||||
click_pause: float = 0.10,
|
||||
reasoning_effort: str = "medium",
|
||||
screen_context_decay_steps: int = 4,
|
||||
disabled_tools: list[str] | None = None,
|
||||
safety_override: bool = False,
|
||||
no_failsafe: bool = False,
|
||||
@@ -46,6 +48,8 @@ class FakeJobManager:
|
||||
"command_timeout": command_timeout,
|
||||
"type_interval": type_interval,
|
||||
"click_pause": click_pause,
|
||||
"reasoning_effort": reasoning_effort,
|
||||
"screen_context_decay_steps": screen_context_decay_steps,
|
||||
"no_failsafe": no_failsafe,
|
||||
}
|
||||
self._jobs[job_id] = {
|
||||
@@ -189,6 +193,8 @@ def test_create_job_returns_only_job_id_and_defaults_model(tmp_path: Path, monke
|
||||
manager = app.state.manager
|
||||
assert manager.last_submit_payload["model"] == "gpt-5.4-mini"
|
||||
assert manager.last_submit_payload["disabled_tools"] == ["click"]
|
||||
assert manager.last_submit_payload["reasoning_effort"] == "medium"
|
||||
assert manager.last_submit_payload["screen_context_decay_steps"] == 4
|
||||
|
||||
status_res = client.get(f"/api/jobs/{job_id}/status", headers=headers)
|
||||
assert status_res.status_code == 200
|
||||
|
||||
8
todo.md
8
todo.md
@@ -4,12 +4,12 @@
|
||||
- [Bug] Enforce single active desktop-control run (or a strict queue) so concurrent jobs cannot fight over the same mouse/keyboard/screen session.
|
||||
- [Bug] Fix run artifact collisions in `setup_artifacts()` (`run_id` is second-granularity, so two jobs in the same second can share/overwrite the same directory).
|
||||
- [Bug] Remove global logger handler clobbering in `setup_logger()` (`logging.getLogger("screenjob").handlers.clear()` breaks concurrent runs and can redirect logs to the wrong file).
|
||||
- [Bug] More consistent clicks and more uses of enhance images.
|
||||
- [x] More consistent clicks and more uses of enhance images.
|
||||
|
||||
## P1
|
||||
- [Idea] Move ui.py into a separate html file and js file.
|
||||
- [Idea] Think harder using effort "medium" by default.
|
||||
- [Idea] Decay old screenshots after 3 to 5 steps to save (1) tokens and (2) brain fuck in the agents.
|
||||
- [x] Move ui.py into a separate html file and js file.
|
||||
- [x] Think harder using effort "medium" by default.
|
||||
- [x] Decay old screenshots after 3 to 5 steps to save (1) tokens and (2) brain fuck in the agents.
|
||||
- [Bug] Validate `disabled_tools` against an allowlist and disallow disabling critical completion flow (`task_complete`) to avoid guaranteed step-limit failures.
|
||||
- [Bug] Improve `execute_command` cancellation/timeout handling to terminate full process trees, not only the parent shell process.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user