From 595375e1a70f468d784657741e41ae26d4eee95e Mon Sep 17 00:00:00 2001 From: Space-Banane Date: Wed, 27 May 2026 21:23:40 +0200 Subject: [PATCH] refactor: remove verification field from responses and update related documentation --- README.md | 10 ++-------- SKILL.md | 21 +++++++++------------ src/agent.py | 35 +++++++++-------------------------- src/cli.py | 1 - src/models.py | 1 - src/storage.py | 5 ++--- src/task_manager.py | 9 +++------ tests/test_agent_tools.py | 6 +----- tests/test_cli.py | 2 -- 9 files changed, 26 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 0fa1929..259dac8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ It lets an LLM use controlled local tools (screen, click, type, shell) to comple - Returns structured agent output as: - `return`: human-readable completion message - `data`: structured payload (for example command output) - - `verification`: final screen-capture metadata for completion accuracy checks ## Core Features @@ -94,11 +93,7 @@ CLI JSON output includes both legacy and structured fields: "data": "file1.txt\nfile2.txt" }, "return": "Task completed successfully", - "data": "file1.txt\nfile2.txt", - "verification": { - "ok": true, - "path": "C:/.../screens/screen_final_verification_step_003.png" - } + "data": "file1.txt\nfile2.txt" } ``` @@ -154,7 +149,6 @@ Each job payload includes: - `response.return` - `response.data` - top-level `return` and `data` aliases -- `verification` (final screenshot path + metadata) ### Monitoring UI @@ -174,7 +168,7 @@ Each job payload includes: - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. - When done, call: - `task_complete(return="...", data=...)` -- A final verification screen capture is always taken automatically on completion. +- Before `task_complete`, verify expected on-screen content with `see_screen` (and `enhance` if needed), and include an `observed_result` summary in `data`. `data` should contain useful structured output for the requester (text, object, list, etc.). diff --git a/SKILL.md b/SKILL.md index 89b649e..ea9ac46 100644 --- a/SKILL.md +++ b/SKILL.md @@ -10,7 +10,6 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin - Mouse/keyboard control (`click`, `type`, `press_key`) - Terminal execution (`execute_command`, `sleep`) - Structured completion payload (`task_complete(return=..., data=...)`) -- Automatic final verification screen capture on completion - Safety gate, auth, history, and live monitoring ## Important Environment Note @@ -31,13 +30,19 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr 1. Submit job via CLI or API. 2. Agent performs tool loop. -3. Read final `response.return`, `response.data`, and `verification` from job status. +3. Read final `response.return` and `response.data` from job status. Keyboard combo rule: - For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`. - Do not split modifier combos into separate calls. +Verification rule: + +- Before `task_complete`, verify actual on-screen content matches the expected outcome. +- Use `see_screen` (and `enhance` if needed) for this check. +- Include a concise `observed_result` in `data` when completing the task. + ## API Quick Reference Base URL: @@ -85,18 +90,10 @@ Result contract in job payload: "status": "completed", "response": { "return": "Task completed successfully", - "data": "file1.txt\nfile2.txt", - "verification": { - "ok": true, - "path": "C:/.../screens/screen_final_verification_step_006.png" - } + "data": "file1.txt\nfile2.txt" }, "return": "Task completed successfully", - "data": "file1.txt\nfile2.txt", - "verification": { - "ok": true, - "path": "C:/.../screens/screen_final_verification_step_006.png" - } + "data": "file1.txt\nfile2.txt" } ``` diff --git a/src/agent.py b/src/agent.py index dff6a1c..2922ff6 100644 --- a/src/agent.py +++ b/src/agent.py @@ -42,7 +42,8 @@ Rules: 9) Keep tool arguments valid JSON and concise. 10) When objective is fully complete, call task_complete(return="...", data=...). 11) The "data" field should contain structured output useful for the requester (for example command output text). -12) Before finishing, always verify outcome with a final screen capture. +12) Before finishing, verify actual screen content matches the expected outcome. +13) For verification, call see_screen (and enhance if needed), then include a concise observed_result in data. """ @@ -80,7 +81,6 @@ class ScreenJobAgent: self.last_screen_meta: dict[str, Any] | None = None self.click_history: list[tuple[int, int, float]] = [] self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()} - self.final_verification: dict[str, Any] | None = None def _emit(self, event_type: str, payload: dict[str, Any]) -> None: if self.event_callback is None: @@ -367,16 +367,6 @@ class ScreenJobAgent: "message": "Screen captured with coordinate grid.", } - def _capture_final_verification(self) -> dict[str, Any]: - image, meta = self._capture_screen(with_grid=True) - out_path = self.artifacts.shots_dir / f"screen_final_verification_step_{self.step:03d}.png" - self._save_image(image, out_path) - data_url = image_to_data_url(image, "PNG") - verification_meta = meta | {"path": str(out_path.resolve()), "final_verification": True} - self.last_screen_data_url = data_url - self.last_screen_meta = verification_meta - return {"ok": True, "path": str(out_path.resolve()), "meta": verification_meta} - def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]: coord = args.get("coordinate") or {} x = int(coord.get("x", 0)) @@ -608,12 +598,7 @@ class ScreenJobAgent: self.completed = True self.final_result = return_text self.final_data = data - try: - verification = self._capture_final_verification() - except Exception as exc: # noqa: BLE001 - verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"} - self.final_verification = verification - return {"ok": True, "return": return_text, "data": data, "verification": verification} + return {"ok": True, "return": return_text, "data": data} def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]: if name in self.disabled_tools: @@ -681,8 +666,9 @@ class ScreenJobAgent: "For modifier shortcuts, use a single press_key combo (example: win+r). " "You can return multiple tool calls in one step (example: click then sleep). " "When done call task_complete(return=..., data=...). " - "Include useful structured output in data. " - "Always finish with a final screen verification." + "Before task_complete, verify the screen content is what was expected " + "using see_screen/enhance and include observed_result in data. " + "Include useful structured output in data." ), } ], @@ -735,8 +721,9 @@ class ScreenJobAgent: "No function call was returned. Continue by using tools. " "Use one press_key call for key combos like win+r. " "You may call multiple tools in one step. " - "When complete, call task_complete(return=..., data=...). " - "Always finish with a final screen verification." + "Before task_complete, verify expected screen content with see_screen/enhance " + "and include observed_result in data. " + "When complete, call task_complete(return=..., data=...)." ), } ], @@ -812,7 +799,6 @@ class ScreenJobAgent: result=self.final_result, return_message=self.final_result, data=self.final_data, - verification=self.final_verification, steps=self.step, started_at=started_at, ended_at=ended_at, @@ -827,7 +813,6 @@ class ScreenJobAgent: result="Cancelled by user request.", return_message="Cancelled by user request.", data=None, - verification=self.final_verification, steps=self.step, started_at=started_at, ended_at=ended_at, @@ -844,7 +829,6 @@ class ScreenJobAgent: result=error_text, return_message=error_text, data=None, - verification=self.final_verification, steps=self.step, started_at=started_at, ended_at=ended_at, @@ -860,7 +844,6 @@ class ScreenJobAgent: result=result_text, return_message=result_text, data=None, - verification=self.final_verification, steps=self.step, started_at=started_at, ended_at=ended_at, diff --git a/src/cli.py b/src/cli.py index 7f346f9..bf379ea 100644 --- a/src/cli.py +++ b/src/cli.py @@ -108,7 +108,6 @@ def main(argv: list[str] | None = None) -> int: "response": {"return": result.return_message, "data": result.data}, "return": result.return_message, "data": result.data, - "verification": result.verification, "steps": result.steps, "elapsed_seconds": round(result.ended_at - result.started_at, 3), "artifacts_dir": str(artifacts.root_dir.resolve()), diff --git a/src/models.py b/src/models.py index b35fd61..7a55fca 100644 --- a/src/models.py +++ b/src/models.py @@ -21,7 +21,6 @@ class AgentResult: result: str return_message: str data: Any | None - verification: dict[str, Any] | None steps: int started_at: float ended_at: float diff --git a/src/storage.py b/src/storage.py index e40256b..3abd7eb 100644 --- a/src/storage.py +++ b/src/storage.py @@ -222,15 +222,14 @@ class HistoryDB: def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]: fallback_return = str(result or "").strip() if not response_json: - return {"return": fallback_return, "data": None, "verification": None} + return {"return": fallback_return, "data": None} try: payload = json.loads(response_json) if isinstance(payload, dict): return { "return": str(payload.get("return") or fallback_return), "data": payload.get("data"), - "verification": payload.get("verification"), } except Exception: pass - return {"return": fallback_return, "data": None, "verification": None} + return {"return": fallback_return, "data": None} diff --git a/src/task_manager.py b/src/task_manager.py index 5618fda..3d8cc42 100644 --- a/src/task_manager.py +++ b/src/task_manager.py @@ -160,7 +160,7 @@ class JobManager: ended_at=ended_at, error=error_text, result=error_text, - response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False), + response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False), ) self._publish( job_id, @@ -239,7 +239,7 @@ class JobManager: ended_at=ended_at, error=err, result=err, - response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False), + response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False), ) self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}}) with self._lock: @@ -259,7 +259,6 @@ class JobManager: { "return": result.return_message, "data": result.data, - "verification": result.verification, }, ensure_ascii=False, ), @@ -284,7 +283,6 @@ class JobManager: "status": status, "result": result.return_message, "response": {"return": result.return_message, "data": result.data}, - "verification": result.verification, "error": result.error, "cancelled": result.cancelled, "usage": result.usage.to_dict(), @@ -348,9 +346,8 @@ class JobManager: def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]: response = job.get("response") if not isinstance(response, dict): - response = {"return": str(job.get("result") or ""), "data": None, "verification": None} + response = {"return": str(job.get("result") or ""), "data": None} job["response"] = response job["return"] = str(response.get("return") or "") job["data"] = response.get("data") - job["verification"] = response.get("verification") return job diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index 0db146e..32c0530 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -71,13 +71,9 @@ def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) -> assert result["ok"] is True assert result["return"] == "Task completed successfully" assert result["data"] == "file1\nfile2" - assert result["verification"]["ok"] is True - verification_path = Path(result["verification"]["path"]) - assert verification_path.exists() - assert verification_path.name.startswith("screen_final_verification_step_") + assert "verification" not in result assert agent.final_result == "Task completed successfully" assert agent.final_data == "file1\nfile2" - assert agent.final_verification is not None def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None: diff --git a/tests/test_cli.py b/tests/test_cli.py index 6622b69..f058a97 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -35,7 +35,6 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path result="Done", return_message="Task completed successfully", data="file1.txt\nfile2.txt", - verification={"ok": True, "path": "C:/tmp/final.png"}, steps=3, started_at=10.0, ended_at=12.5, @@ -67,4 +66,3 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path assert payload["response"]["data"] == "file1.txt\nfile2.txt" assert payload["return"] == "Task completed successfully" assert payload["data"] == "file1.txt\nfile2.txt" - assert payload["verification"] == {"ok": True, "path": "C:/tmp/final.png"}