diff --git a/src/agent.py b/src/agent.py index 59a3124..20fc2f4 100644 --- a/src/agent.py +++ b/src/agent.py @@ -76,11 +76,14 @@ class ScreenJobAgent: self.final_data: Any | None = None self.previous_response_id: str | None = None self.usage = UsageSummary() + self.objective = "" self.last_screen_data_url: str | None = None self.last_screen_meta: dict[str, Any] | None = None self.click_history: list[tuple[int, int, float]] = [] self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()} + self.recent_tool_summaries: list[str] = [] + self.last_context_compact_step = 0 def _emit(self, event_type: str, payload: dict[str, Any]) -> None: if self.event_callback is None: @@ -642,7 +645,82 @@ class ScreenJobAgent: reasoning={"effort": effort}, ) + def _record_tool_summary(self, tool_name: str, result: dict[str, Any]) -> None: + ok = bool(result.get("ok")) + status = "ok" if ok else "fail" + summary = f"step={self.step} tool={tool_name} status={status}" + if tool_name == "click": + clicked = result.get("clicked") if isinstance(result.get("clicked"), dict) else {} + x = clicked.get("x") + y = clicked.get("y") + if isinstance(x, int) and isinstance(y, int): + summary = f"{summary} at=({x},{y})" + elif tool_name == "type": + typed_length = int(result.get("typed_length", 0) or 0) + summary = f"{summary} typed_length={typed_length}" + elif tool_name == "press_key": + key = str(result.get("key") or "").strip() + if key: + summary = f"{summary} key={key}" + elif tool_name == "execute_command": + exit_code = result.get("exit_code") + if exit_code is not None: + summary = f"{summary} exit_code={exit_code}" + elif tool_name in {"see_screen", "enhance"}: + meta = result.get("meta") if isinstance(result.get("meta"), dict) else {} + path = str(meta.get("path") or result.get("path") or "").strip() + if path: + summary = f"{summary} image={path}" + if not ok: + error_text = str(result.get("error") or "").strip() + if error_text: + summary = f"{summary} error={error_text[:140]}" + self.recent_tool_summaries.append(summary) + self.recent_tool_summaries = self.recent_tool_summaries[-20:] + + def _should_compact_context(self) -> bool: + interval = max(0, int(self.options.screen_context_decay_steps or 0)) + if interval <= 0: + return False + if self.previous_response_id is None: + return False + return (self.step - self.last_context_compact_step) >= interval + + def _build_compacted_pending_input(self) -> list[dict[str, Any]]: + recent = self.recent_tool_summaries[-8:] + lines = "\n".join(f"- {line}" for line in recent) if recent else "- No recent tool activity." + content = ( + "Context compaction activated to decay stale screenshots and reduce token usage.\n" + f"JOB: {self.objective}\n" + f"Current step: {self.step}\n" + "Recent tool activity:\n" + f"{lines}\n" + "Continue execution from the latest screen state. " + "Use tools only, and finish with task_complete when done." + ) + compacted_input: list[dict[str, Any]] = [ + { + "role": "user", + "content": [ + { + "type": "input_text", + "text": content, + } + ], + } + ] + if self.last_screen_data_url and self.last_screen_meta: + compacted_input.append( + self._build_visual_message( + "Current screen after context compaction", + self.last_screen_data_url, + self.last_screen_meta, + ) + ) + return compacted_input + def run(self, job: str) -> AgentResult: + self.objective = job started_at = time.time() self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model) self.logger.info("Job: %s", job) @@ -653,6 +731,7 @@ class ScreenJobAgent: "run_id": self.artifacts.run_id, "model": self.options.model, "reasoning_effort": self.options.reasoning_effort, + "screen_context_decay_steps": self.options.screen_context_decay_steps, "objective": job, "disabled_tools": sorted(self.disabled_tools), }, @@ -697,6 +776,19 @@ class ScreenJobAgent: self.step += 1 self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps) self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps}) + if self._should_compact_context(): + self.previous_response_id = None + pending_input = self._build_compacted_pending_input() + self.last_context_compact_step = self.step + self.logger.info("Compacted model context at step %d.", self.step) + self._emit( + "context_compacted", + { + "step": self.step, + "decay_steps": self.options.screen_context_decay_steps, + "recent_tool_summaries": self.recent_tool_summaries[-8:], + }, + ) try: response = self._call_model(pending_input) self._register_usage(response) @@ -768,6 +860,7 @@ class ScreenJobAgent: name, json.dumps(result, ensure_ascii=False)[:2500], ) + self._record_tool_summary(name, result) self._emit("tool_result", {"step": self.step, "tool": name, "result": result}) next_input.append( { diff --git a/src/cli.py b/src/cli.py index 53b4426..ecccbe4 100644 --- a/src/cli.py +++ b/src/cli.py @@ -34,6 +34,12 @@ def build_parser() -> argparse.ArgumentParser: default="medium", help="Reasoning effort passed to the model.", ) + parser.add_argument( + "--screen-context-decay-steps", + type=int, + default=4, + help="Compact model context every N steps to decay old screenshots (0 disables).", + ) parser.add_argument("--disable-tool", action="append", default=[], help="Disable a tool by name.") parser.add_argument("--skip-safety-check", action="store_true", help="Bypass pre-flight safety check.") parser.add_argument("--no-failsafe", action="store_true", help="Disable PyAutoGUI fail-safe.") @@ -85,6 +91,7 @@ def main(argv: list[str] | None = None) -> int: type_interval=args.type_interval, click_pause=args.click_pause, reasoning_effort=args.reasoning_effort, + screen_context_decay_steps=max(0, int(args.screen_context_decay_steps)), disable_tools=set(disabled_tools), ) try: diff --git a/src/models.py b/src/models.py index 098d323..22bdfc9 100644 --- a/src/models.py +++ b/src/models.py @@ -59,4 +59,5 @@ class RuntimeOptions: type_interval: float = 0.02 click_pause: float = 0.10 reasoning_effort: str = "medium" + screen_context_decay_steps: int = 4 disable_tools: set[str] | None = None diff --git a/src/server.py b/src/server.py index 16a97f3..d27b9ad 100644 --- a/src/server.py +++ b/src/server.py @@ -26,6 +26,7 @@ class CreateJobRequest(BaseModel): type_interval: float = Field(0.02, ge=0.0, le=1.0) click_pause: float = Field(0.10, ge=0.0, le=2.0) reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$") + screen_context_decay_steps: int = Field(4, ge=0, le=50) disabled_tools: list[str] = Field(default_factory=list) safety_override: bool = False no_failsafe: bool = False @@ -303,6 +304,7 @@ def create_app(config: AppConfig | None = None) -> FastAPI: type_interval=payload.type_interval, click_pause=payload.click_pause, reasoning_effort=payload.reasoning_effort, + screen_context_decay_steps=payload.screen_context_decay_steps, disabled_tools=payload.disabled_tools, safety_override=payload.safety_override, no_failsafe=payload.no_failsafe, diff --git a/src/task_manager.py b/src/task_manager.py index 114adee..0fa3157 100644 --- a/src/task_manager.py +++ b/src/task_manager.py @@ -49,6 +49,7 @@ class JobManager: type_interval: float = 0.02, click_pause: float = 0.10, reasoning_effort: str = "medium", + screen_context_decay_steps: int = 4, disabled_tools: list[str] | None = None, safety_override: bool = False, no_failsafe: bool = False, @@ -95,6 +96,7 @@ class JobManager: "type_interval": type_interval, "click_pause": click_pause, "reasoning_effort": reasoning_effort, + "screen_context_decay_steps": screen_context_decay_steps, "no_failsafe": no_failsafe, "cancel_event": cancel_event, }, @@ -124,6 +126,7 @@ class JobManager: type_interval: float, click_pause: float, reasoning_effort: str, + screen_context_decay_steps: int, no_failsafe: bool, cancel_event: threading.Event, ) -> None: @@ -222,6 +225,7 @@ class JobManager: type_interval=type_interval, click_pause=click_pause, reasoning_effort=reasoning_effort, + screen_context_decay_steps=max(0, int(screen_context_decay_steps)), disable_tools=set(disabled_tools), ) try: diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index 32c0530..963374e 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -98,3 +98,21 @@ def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None: assert result["key"] == "win+r" assert result["message"] == "Key combo executed." assert agent_module.pyautogui.last_hotkey == ("win", "r") + + +def test_context_compaction_trigger_and_payload(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.objective = "Open settings app" + agent.previous_response_id = "resp_123" + agent.step = 4 + agent.last_context_compact_step = 0 + agent.options.screen_context_decay_steps = 4 + agent.recent_tool_summaries = ["step=1 tool=see_screen status=ok"] + agent.last_screen_data_url = "data:image/png;base64,abc" + agent.last_screen_meta = {"width": 1280, "height": 720, "path": "C:/tmp/frame.png"} + + assert agent._should_compact_context() is True + compacted = agent._build_compacted_pending_input() + assert len(compacted) == 2 + assert "Context compaction activated" in compacted[0]["content"][0]["text"] + assert "Open settings app" in compacted[0]["content"][0]["text"] diff --git a/tests/test_cli.py b/tests/test_cli.py index 37cc3de..042a420 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -70,3 +70,4 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path assert payload["return"] == "Task completed successfully" assert payload["data"] == "file1.txt\nfile2.txt" assert captured_kwargs["options"].reasoning_effort == "medium" + assert captured_kwargs["options"].screen_context_decay_steps == 4 diff --git a/tests/test_server_api.py b/tests/test_server_api.py index 676bc3a..a3b422a 100644 --- a/tests/test_server_api.py +++ b/tests/test_server_api.py @@ -27,6 +27,7 @@ class FakeJobManager: type_interval: float = 0.02, click_pause: float = 0.10, reasoning_effort: str = "medium", + screen_context_decay_steps: int = 4, disabled_tools: list[str] | None = None, safety_override: bool = False, no_failsafe: bool = False, @@ -48,6 +49,7 @@ class FakeJobManager: "type_interval": type_interval, "click_pause": click_pause, "reasoning_effort": reasoning_effort, + "screen_context_decay_steps": screen_context_decay_steps, "no_failsafe": no_failsafe, } self._jobs[job_id] = { @@ -192,6 +194,7 @@ def test_create_job_returns_only_job_id_and_defaults_model(tmp_path: Path, monke assert manager.last_submit_payload["model"] == "gpt-5.4-mini" assert manager.last_submit_payload["disabled_tools"] == ["click"] assert manager.last_submit_payload["reasoning_effort"] == "medium" + assert manager.last_submit_payload["screen_context_decay_steps"] == 4 status_res = client.get(f"/api/jobs/{job_id}/status", headers=headers) assert status_res.status_code == 200