Compact screenshot context every 4 steps by default
This commit is contained in:
93
src/agent.py
93
src/agent.py
@@ -76,11 +76,14 @@ class ScreenJobAgent:
|
|||||||
self.final_data: Any | None = None
|
self.final_data: Any | None = None
|
||||||
self.previous_response_id: str | None = None
|
self.previous_response_id: str | None = None
|
||||||
self.usage = UsageSummary()
|
self.usage = UsageSummary()
|
||||||
|
self.objective = ""
|
||||||
|
|
||||||
self.last_screen_data_url: str | None = None
|
self.last_screen_data_url: str | None = None
|
||||||
self.last_screen_meta: dict[str, Any] | None = None
|
self.last_screen_meta: dict[str, Any] | None = None
|
||||||
self.click_history: list[tuple[int, int, float]] = []
|
self.click_history: list[tuple[int, int, float]] = []
|
||||||
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
|
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
|
||||||
|
self.recent_tool_summaries: list[str] = []
|
||||||
|
self.last_context_compact_step = 0
|
||||||
|
|
||||||
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
|
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
|
||||||
if self.event_callback is None:
|
if self.event_callback is None:
|
||||||
@@ -642,7 +645,82 @@ class ScreenJobAgent:
|
|||||||
reasoning={"effort": effort},
|
reasoning={"effort": effort},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _record_tool_summary(self, tool_name: str, result: dict[str, Any]) -> None:
    """Append a one-line digest of a tool result to ``self.recent_tool_summaries``.

    Every digest starts with ``step=<self.step> tool=<tool_name> status=<ok|fail>``
    and then gains tool-specific detail read from ``result``:

    * ``click``: ``at=(x,y)`` when ``result["clicked"]`` is a dict with int
      ``x`` and ``y``.
    * ``type``: ``typed_length=<n>`` (``0`` when missing or not convertible).
    * ``press_key``: ``key=<key>`` when a non-empty key is present.
    * ``execute_command``: ``exit_code=<code>`` when it is not ``None``.
    * ``see_screen`` / ``enhance``: ``image=<path>`` taken from
      ``result["meta"]["path"]`` or, failing that, ``result["path"]``.

    When ``result["ok"]`` is falsy, the error text (whitespace-collapsed and
    capped at 140 characters) is appended as ``error=<text>``. Only the 20 most
    recent digests are retained.

    Args:
        tool_name: Name of the tool that produced ``result``.
        result: The tool's result mapping.
    """
    ok = bool(result.get("ok"))
    parts = [f"step={self.step}", f"tool={tool_name}", f"status={'ok' if ok else 'fail'}"]

    if tool_name == "click":
        clicked = result.get("clicked")
        if isinstance(clicked, dict):
            x = clicked.get("x")
            y = clicked.get("y")
            if isinstance(x, int) and isinstance(y, int):
                parts.append(f"at=({x},{y})")
    elif tool_name == "type":
        # The digest is purely diagnostic: a malformed length must not abort
        # the agent step, so fall back to 0 instead of letting int() raise.
        try:
            typed_length = int(result.get("typed_length") or 0)
        except (TypeError, ValueError):
            typed_length = 0
        parts.append(f"typed_length={typed_length}")
    elif tool_name == "press_key":
        key = str(result.get("key") or "").strip()
        if key:
            parts.append(f"key={key}")
    elif tool_name == "execute_command":
        exit_code = result.get("exit_code")
        if exit_code is not None:
            parts.append(f"exit_code={exit_code}")
    elif tool_name in {"see_screen", "enhance"}:
        meta = result.get("meta")
        if not isinstance(meta, dict):
            meta = {}
        path = str(meta.get("path") or result.get("path") or "").strip()
        if path:
            parts.append(f"image={path}")

    if not ok:
        # Collapse newlines/tabs so each digest stays on a single line; the
        # digests are later rendered one per "- " bullet.
        error_text = " ".join(str(result.get("error") or "").split())
        if error_text:
            parts.append(f"error={error_text[:140]}")

    self.recent_tool_summaries.append(" ".join(parts))
    # Rebind (rather than mutate in place) to keep a bounded 20-entry history.
    self.recent_tool_summaries = self.recent_tool_summaries[-20:]
|
||||||
|
|
||||||
|
def _should_compact_context(self) -> bool:
    """Report whether the model context is due for compaction on this step.

    Returns ``False`` when ``options.screen_context_decay_steps`` is unset,
    zero or negative, or when ``previous_response_id`` is ``None``.
    Otherwise returns ``True`` once at least that many steps have elapsed
    since ``last_context_compact_step``.
    """
    decay_steps = int(self.options.screen_context_decay_steps or 0)
    if decay_steps < 1 or self.previous_response_id is None:
        return False
    steps_since_compaction = self.step - self.last_context_compact_step
    return steps_since_compaction >= decay_steps
|
||||||
|
|
||||||
|
def _build_compacted_pending_input(self) -> list[dict[str, Any]]:
    """Build the replacement model input used after context compaction.

    The first message is a user text message that restates the objective
    and current step and lists up to the eight most recent tool summaries
    as ``- `` bullets. When both ``last_screen_data_url`` and
    ``last_screen_meta`` are set, a second message produced by
    ``_build_visual_message`` is appended.

    Returns:
        A list containing one message, or two when screen data is available.
    """
    tail = self.recent_tool_summaries[-8:]
    if tail:
        activity = "\n".join(f"- {entry}" for entry in tail)
    else:
        activity = "- No recent tool activity."

    prompt = "".join(
        [
            "Context compaction activated to decay stale screenshots and reduce token usage.\n",
            f"JOB: {self.objective}\n",
            f"Current step: {self.step}\n",
            "Recent tool activity:\n",
            f"{activity}\n",
            "Continue execution from the latest screen state. ",
            "Use tools only, and finish with task_complete when done.",
        ]
    )

    text_part = {"type": "input_text", "text": prompt}
    messages: list[dict[str, Any]] = [{"role": "user", "content": [text_part]}]

    # Without both the image payload and its metadata there is nothing to attach.
    if not (self.last_screen_data_url and self.last_screen_meta):
        return messages

    visual_message = self._build_visual_message(
        "Current screen after context compaction",
        self.last_screen_data_url,
        self.last_screen_meta,
    )
    messages.append(visual_message)
    return messages
|
||||||
|
|
||||||
def run(self, job: str) -> AgentResult:
|
def run(self, job: str) -> AgentResult:
|
||||||
|
self.objective = job
|
||||||
started_at = time.time()
|
started_at = time.time()
|
||||||
self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
|
self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
|
||||||
self.logger.info("Job: %s", job)
|
self.logger.info("Job: %s", job)
|
||||||
@@ -653,6 +731,7 @@ class ScreenJobAgent:
|
|||||||
"run_id": self.artifacts.run_id,
|
"run_id": self.artifacts.run_id,
|
||||||
"model": self.options.model,
|
"model": self.options.model,
|
||||||
"reasoning_effort": self.options.reasoning_effort,
|
"reasoning_effort": self.options.reasoning_effort,
|
||||||
|
"screen_context_decay_steps": self.options.screen_context_decay_steps,
|
||||||
"objective": job,
|
"objective": job,
|
||||||
"disabled_tools": sorted(self.disabled_tools),
|
"disabled_tools": sorted(self.disabled_tools),
|
||||||
},
|
},
|
||||||
@@ -697,6 +776,19 @@ class ScreenJobAgent:
|
|||||||
self.step += 1
|
self.step += 1
|
||||||
self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
|
self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
|
||||||
self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
|
self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
|
||||||
|
if self._should_compact_context():
|
||||||
|
self.previous_response_id = None
|
||||||
|
pending_input = self._build_compacted_pending_input()
|
||||||
|
self.last_context_compact_step = self.step
|
||||||
|
self.logger.info("Compacted model context at step %d.", self.step)
|
||||||
|
self._emit(
|
||||||
|
"context_compacted",
|
||||||
|
{
|
||||||
|
"step": self.step,
|
||||||
|
"decay_steps": self.options.screen_context_decay_steps,
|
||||||
|
"recent_tool_summaries": self.recent_tool_summaries[-8:],
|
||||||
|
},
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
response = self._call_model(pending_input)
|
response = self._call_model(pending_input)
|
||||||
self._register_usage(response)
|
self._register_usage(response)
|
||||||
@@ -768,6 +860,7 @@ class ScreenJobAgent:
|
|||||||
name,
|
name,
|
||||||
json.dumps(result, ensure_ascii=False)[:2500],
|
json.dumps(result, ensure_ascii=False)[:2500],
|
||||||
)
|
)
|
||||||
|
self._record_tool_summary(name, result)
|
||||||
self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
|
self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
|
||||||
next_input.append(
|
next_input.append(
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -34,6 +34,12 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
default="medium",
|
default="medium",
|
||||||
help="Reasoning effort passed to the model.",
|
help="Reasoning effort passed to the model.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--screen-context-decay-steps",
|
||||||
|
type=int,
|
||||||
|
default=4,
|
||||||
|
help="Compact model context every N steps to decay old screenshots (0 disables).",
|
||||||
|
)
|
||||||
parser.add_argument("--disable-tool", action="append", default=[], help="Disable a tool by name.")
|
parser.add_argument("--disable-tool", action="append", default=[], help="Disable a tool by name.")
|
||||||
parser.add_argument("--skip-safety-check", action="store_true", help="Bypass pre-flight safety check.")
|
parser.add_argument("--skip-safety-check", action="store_true", help="Bypass pre-flight safety check.")
|
||||||
parser.add_argument("--no-failsafe", action="store_true", help="Disable PyAutoGUI fail-safe.")
|
parser.add_argument("--no-failsafe", action="store_true", help="Disable PyAutoGUI fail-safe.")
|
||||||
@@ -85,6 +91,7 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
type_interval=args.type_interval,
|
type_interval=args.type_interval,
|
||||||
click_pause=args.click_pause,
|
click_pause=args.click_pause,
|
||||||
reasoning_effort=args.reasoning_effort,
|
reasoning_effort=args.reasoning_effort,
|
||||||
|
screen_context_decay_steps=max(0, int(args.screen_context_decay_steps)),
|
||||||
disable_tools=set(disabled_tools),
|
disable_tools=set(disabled_tools),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -59,4 +59,5 @@ class RuntimeOptions:
|
|||||||
type_interval: float = 0.02
|
type_interval: float = 0.02
|
||||||
click_pause: float = 0.10
|
click_pause: float = 0.10
|
||||||
reasoning_effort: str = "medium"
|
reasoning_effort: str = "medium"
|
||||||
|
screen_context_decay_steps: int = 4
|
||||||
disable_tools: set[str] | None = None
|
disable_tools: set[str] | None = None
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class CreateJobRequest(BaseModel):
|
|||||||
type_interval: float = Field(0.02, ge=0.0, le=1.0)
|
type_interval: float = Field(0.02, ge=0.0, le=1.0)
|
||||||
click_pause: float = Field(0.10, ge=0.0, le=2.0)
|
click_pause: float = Field(0.10, ge=0.0, le=2.0)
|
||||||
reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$")
|
reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$")
|
||||||
|
screen_context_decay_steps: int = Field(4, ge=0, le=50)
|
||||||
disabled_tools: list[str] = Field(default_factory=list)
|
disabled_tools: list[str] = Field(default_factory=list)
|
||||||
safety_override: bool = False
|
safety_override: bool = False
|
||||||
no_failsafe: bool = False
|
no_failsafe: bool = False
|
||||||
@@ -303,6 +304,7 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
|
|||||||
type_interval=payload.type_interval,
|
type_interval=payload.type_interval,
|
||||||
click_pause=payload.click_pause,
|
click_pause=payload.click_pause,
|
||||||
reasoning_effort=payload.reasoning_effort,
|
reasoning_effort=payload.reasoning_effort,
|
||||||
|
screen_context_decay_steps=payload.screen_context_decay_steps,
|
||||||
disabled_tools=payload.disabled_tools,
|
disabled_tools=payload.disabled_tools,
|
||||||
safety_override=payload.safety_override,
|
safety_override=payload.safety_override,
|
||||||
no_failsafe=payload.no_failsafe,
|
no_failsafe=payload.no_failsafe,
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ class JobManager:
|
|||||||
type_interval: float = 0.02,
|
type_interval: float = 0.02,
|
||||||
click_pause: float = 0.10,
|
click_pause: float = 0.10,
|
||||||
reasoning_effort: str = "medium",
|
reasoning_effort: str = "medium",
|
||||||
|
screen_context_decay_steps: int = 4,
|
||||||
disabled_tools: list[str] | None = None,
|
disabled_tools: list[str] | None = None,
|
||||||
safety_override: bool = False,
|
safety_override: bool = False,
|
||||||
no_failsafe: bool = False,
|
no_failsafe: bool = False,
|
||||||
@@ -95,6 +96,7 @@ class JobManager:
|
|||||||
"type_interval": type_interval,
|
"type_interval": type_interval,
|
||||||
"click_pause": click_pause,
|
"click_pause": click_pause,
|
||||||
"reasoning_effort": reasoning_effort,
|
"reasoning_effort": reasoning_effort,
|
||||||
|
"screen_context_decay_steps": screen_context_decay_steps,
|
||||||
"no_failsafe": no_failsafe,
|
"no_failsafe": no_failsafe,
|
||||||
"cancel_event": cancel_event,
|
"cancel_event": cancel_event,
|
||||||
},
|
},
|
||||||
@@ -124,6 +126,7 @@ class JobManager:
|
|||||||
type_interval: float,
|
type_interval: float,
|
||||||
click_pause: float,
|
click_pause: float,
|
||||||
reasoning_effort: str,
|
reasoning_effort: str,
|
||||||
|
screen_context_decay_steps: int,
|
||||||
no_failsafe: bool,
|
no_failsafe: bool,
|
||||||
cancel_event: threading.Event,
|
cancel_event: threading.Event,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -222,6 +225,7 @@ class JobManager:
|
|||||||
type_interval=type_interval,
|
type_interval=type_interval,
|
||||||
click_pause=click_pause,
|
click_pause=click_pause,
|
||||||
reasoning_effort=reasoning_effort,
|
reasoning_effort=reasoning_effort,
|
||||||
|
screen_context_decay_steps=max(0, int(screen_context_decay_steps)),
|
||||||
disable_tools=set(disabled_tools),
|
disable_tools=set(disabled_tools),
|
||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -98,3 +98,21 @@ def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
|
|||||||
assert result["key"] == "win+r"
|
assert result["key"] == "win+r"
|
||||||
assert result["message"] == "Key combo executed."
|
assert result["message"] == "Key combo executed."
|
||||||
assert agent_module.pyautogui.last_hotkey == ("win", "r")
|
assert agent_module.pyautogui.last_hotkey == ("win", "r")
|
||||||
|
|
||||||
|
|
||||||
|
def test_context_compaction_trigger_and_payload(tmp_path: Path, monkeypatch) -> None:
    """Compaction fires at the decay interval and yields a text plus a visual message."""
    agent = _build_agent(tmp_path, monkeypatch)

    # Four steps since the last compaction with a decay interval of four.
    agent.options.screen_context_decay_steps = 4
    agent.last_context_compact_step = 0
    agent.step = 4
    agent.previous_response_id = "resp_123"

    # State that feeds the compacted payload.
    agent.objective = "Open settings app"
    agent.recent_tool_summaries = ["step=1 tool=see_screen status=ok"]
    agent.last_screen_data_url = "data:image/png;base64,abc"
    agent.last_screen_meta = {"width": 1280, "height": 720, "path": "C:/tmp/frame.png"}

    assert agent._should_compact_context() is True

    compacted = agent._build_compacted_pending_input()
    assert len(compacted) == 2

    prompt_text = compacted[0]["content"][0]["text"]
    assert "Context compaction activated" in prompt_text
    assert "Open settings app" in prompt_text
|
||||||
|
|||||||
@@ -70,3 +70,4 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
|
|||||||
assert payload["return"] == "Task completed successfully"
|
assert payload["return"] == "Task completed successfully"
|
||||||
assert payload["data"] == "file1.txt\nfile2.txt"
|
assert payload["data"] == "file1.txt\nfile2.txt"
|
||||||
assert captured_kwargs["options"].reasoning_effort == "medium"
|
assert captured_kwargs["options"].reasoning_effort == "medium"
|
||||||
|
assert captured_kwargs["options"].screen_context_decay_steps == 4
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ class FakeJobManager:
|
|||||||
type_interval: float = 0.02,
|
type_interval: float = 0.02,
|
||||||
click_pause: float = 0.10,
|
click_pause: float = 0.10,
|
||||||
reasoning_effort: str = "medium",
|
reasoning_effort: str = "medium",
|
||||||
|
screen_context_decay_steps: int = 4,
|
||||||
disabled_tools: list[str] | None = None,
|
disabled_tools: list[str] | None = None,
|
||||||
safety_override: bool = False,
|
safety_override: bool = False,
|
||||||
no_failsafe: bool = False,
|
no_failsafe: bool = False,
|
||||||
@@ -48,6 +49,7 @@ class FakeJobManager:
|
|||||||
"type_interval": type_interval,
|
"type_interval": type_interval,
|
||||||
"click_pause": click_pause,
|
"click_pause": click_pause,
|
||||||
"reasoning_effort": reasoning_effort,
|
"reasoning_effort": reasoning_effort,
|
||||||
|
"screen_context_decay_steps": screen_context_decay_steps,
|
||||||
"no_failsafe": no_failsafe,
|
"no_failsafe": no_failsafe,
|
||||||
}
|
}
|
||||||
self._jobs[job_id] = {
|
self._jobs[job_id] = {
|
||||||
@@ -192,6 +194,7 @@ def test_create_job_returns_only_job_id_and_defaults_model(tmp_path: Path, monke
|
|||||||
assert manager.last_submit_payload["model"] == "gpt-5.4-mini"
|
assert manager.last_submit_payload["model"] == "gpt-5.4-mini"
|
||||||
assert manager.last_submit_payload["disabled_tools"] == ["click"]
|
assert manager.last_submit_payload["disabled_tools"] == ["click"]
|
||||||
assert manager.last_submit_payload["reasoning_effort"] == "medium"
|
assert manager.last_submit_payload["reasoning_effort"] == "medium"
|
||||||
|
assert manager.last_submit_payload["screen_context_decay_steps"] == 4
|
||||||
|
|
||||||
status_res = client.get(f"/api/jobs/{job_id}/status", headers=headers)
|
status_res = client.get(f"/api/jobs/{job_id}/status", headers=headers)
|
||||||
assert status_res.status_code == 200
|
assert status_res.status_code == 200
|
||||||
|
|||||||
Reference in New Issue
Block a user