From 595375e1a70f468d784657741e41ae26d4eee95e Mon Sep 17 00:00:00 2001
From: Space-Banane <paulwaehner923@gmail.com>
Date: Wed, 27 May 2026 21:23:40 +0200
Subject: [PATCH] refactor: remove verification field from responses and update
 related documentation

---
 README.md                 | 10 ++--------
 SKILL.md                  | 21 +++++++++------------
 src/agent.py              | 35 +++++++++--------------------------
 src/cli.py                |  1 -
 src/models.py             |  1 -
 src/storage.py            |  5 ++---
 src/task_manager.py       |  9 +++------
 tests/test_agent_tools.py |  6 +-----
 tests/test_cli.py         |  2 --
 9 files changed, 26 insertions(+), 64 deletions(-)

diff --git a/README.md b/README.md
index 0fa1929..259dac8 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,6 @@ It lets an LLM use controlled local tools (screen, click, type, shell) to comple
 - Returns structured agent output as:
   - `return`: human-readable completion message
   - `data`: structured payload (for example command output)
-  - `verification`: final screen-capture metadata for completion accuracy checks
 
 ## Core Features
 
@@ -94,11 +93,7 @@ CLI JSON output includes both legacy and structured fields:
     "data": "file1.txt\nfile2.txt"
   },
   "return": "Task completed successfully",
-  "data": "file1.txt\nfile2.txt",
-  "verification": {
-    "ok": true,
-    "path": "C:/.../screens/screen_final_verification_step_003.png"
-  }
+  "data": "file1.txt\nfile2.txt"
 }
 ```
 
@@ -154,7 +149,6 @@ Each job payload includes:
 - `response.return`
 - `response.data`
 - top-level `return` and `data` aliases
-- `verification` (final screenshot path + metadata)
 
 ### Monitoring UI
 
@@ -174,7 +168,7 @@ Each job payload includes:
 - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
 - When done, call:
   - `task_complete(return="...", data=...)`
-- A final verification screen capture is always taken automatically on completion.
+- Before calling `task_complete`, verify that the screen shows the expected result using `see_screen` (and `enhance` if needed), and include an `observed_result` summary in `data`.
 
 `data` should contain useful structured output for the requester (text, object, list, etc.).
 
diff --git a/SKILL.md b/SKILL.md
index 89b649e..ea9ac46 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -10,7 +10,6 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin
 - Mouse/keyboard control (`click`, `type`, `press_key`)
 - Terminal execution (`execute_command`, `sleep`)
 - Structured completion payload (`task_complete(return=..., data=...)`)
-- Automatic final verification screen capture on completion
 - Safety gate, auth, history, and live monitoring
 
 ## Important Environment Note
@@ -31,13 +30,19 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr
 
 1. Submit job via CLI or API.
 2. Agent performs tool loop.
-3. Read final `response.return`, `response.data`, and `verification` from job status.
+3. Read final `response.return` and `response.data` from job status.
 
 Keyboard combo rule:
 
 - For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
 - Do not split modifier combos into separate calls.
 
+Verification rule:
+
+- Before calling `task_complete`, verify that the actual on-screen content matches the expected outcome.
+- Use `see_screen` (and `enhance` if needed) for this check.
+- Include a concise `observed_result` in `data` when completing the task.
+
 ## API Quick Reference
 
 Base URL:
@@ -85,18 +90,10 @@ Result contract in job payload:
   "status": "completed",
   "response": {
     "return": "Task completed successfully",
-    "data": "file1.txt\nfile2.txt",
-    "verification": {
-      "ok": true,
-      "path": "C:/.../screens/screen_final_verification_step_006.png"
-    }
+    "data": "file1.txt\nfile2.txt"
   },
   "return": "Task completed successfully",
-  "data": "file1.txt\nfile2.txt",
-  "verification": {
-    "ok": true,
-    "path": "C:/.../screens/screen_final_verification_step_006.png"
-  }
+  "data": "file1.txt\nfile2.txt"
 }
 ```
 
diff --git a/src/agent.py b/src/agent.py
index dff6a1c..2922ff6 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -42,7 +42,8 @@ Rules:
 9) Keep tool arguments valid JSON and concise.
 10) When objective is fully complete, call task_complete(return="...", data=...).
 11) The "data" field should contain structured output useful for the requester (for example command output text).
-12) Before finishing, always verify outcome with a final screen capture.
+12) Before finishing, verify actual screen content matches the expected outcome.
+13) For verification, call see_screen (and enhance if needed), then include a concise observed_result in data.
 """
 
 
@@ -80,7 +81,6 @@ class ScreenJobAgent:
         self.last_screen_meta: dict[str, Any] | None = None
         self.click_history: list[tuple[int, int, float]] = []
         self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
-        self.final_verification: dict[str, Any] | None = None
 
     def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
         if self.event_callback is None:
@@ -367,16 +367,6 @@ class ScreenJobAgent:
             "message": "Screen captured with coordinate grid.",
         }
 
-    def _capture_final_verification(self) -> dict[str, Any]:
-        image, meta = self._capture_screen(with_grid=True)
-        out_path = self.artifacts.shots_dir / f"screen_final_verification_step_{self.step:03d}.png"
-        self._save_image(image, out_path)
-        data_url = image_to_data_url(image, "PNG")
-        verification_meta = meta | {"path": str(out_path.resolve()), "final_verification": True}
-        self.last_screen_data_url = data_url
-        self.last_screen_meta = verification_meta
-        return {"ok": True, "path": str(out_path.resolve()), "meta": verification_meta}
-
     def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
         coord = args.get("coordinate") or {}
         x = int(coord.get("x", 0))
@@ -608,12 +598,7 @@ class ScreenJobAgent:
         self.completed = True
         self.final_result = return_text
         self.final_data = data
-        try:
-            verification = self._capture_final_verification()
-        except Exception as exc:  # noqa: BLE001
-            verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
-        self.final_verification = verification
-        return {"ok": True, "return": return_text, "data": data, "verification": verification}
+        return {"ok": True, "return": return_text, "data": data}
 
     def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
         if name in self.disabled_tools:
@@ -681,8 +666,9 @@ class ScreenJobAgent:
                             "For modifier shortcuts, use a single press_key combo (example: win+r). "
                             "You can return multiple tool calls in one step (example: click then sleep). "
                             "When done call task_complete(return=..., data=...). "
-                            "Include useful structured output in data. "
-                            "Always finish with a final screen verification."
+                            "Before task_complete, verify the screen content is what was expected "
+                            "using see_screen/enhance and include observed_result in data. "
+                            "Include useful structured output in data."
                         ),
                     }
                 ],
@@ -735,8 +721,9 @@ class ScreenJobAgent:
                                     "No function call was returned. Continue by using tools. "
                                     "Use one press_key call for key combos like win+r. "
                                     "You may call multiple tools in one step. "
-                                    "When complete, call task_complete(return=..., data=...). "
-                                    "Always finish with a final screen verification."
+                                    "Before task_complete, verify expected screen content with see_screen/enhance "
+                                    "and include observed_result in data. "
+                                    "When complete, call task_complete(return=..., data=...)."
                                 ),
                             }
                         ],
@@ -812,7 +799,6 @@ class ScreenJobAgent:
                 result=self.final_result,
                 return_message=self.final_result,
                 data=self.final_data,
-                verification=self.final_verification,
                 steps=self.step,
                 started_at=started_at,
                 ended_at=ended_at,
@@ -827,7 +813,6 @@ class ScreenJobAgent:
                 result="Cancelled by user request.",
                 return_message="Cancelled by user request.",
                 data=None,
-                verification=self.final_verification,
                 steps=self.step,
                 started_at=started_at,
                 ended_at=ended_at,
@@ -844,7 +829,6 @@ class ScreenJobAgent:
                 result=error_text,
                 return_message=error_text,
                 data=None,
-                verification=self.final_verification,
                 steps=self.step,
                 started_at=started_at,
                 ended_at=ended_at,
@@ -860,7 +844,6 @@ class ScreenJobAgent:
             result=result_text,
             return_message=result_text,
             data=None,
-            verification=self.final_verification,
             steps=self.step,
             started_at=started_at,
             ended_at=ended_at,
diff --git a/src/cli.py b/src/cli.py
index 7f346f9..bf379ea 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -108,7 +108,6 @@ def main(argv: list[str] | None = None) -> int:
         "response": {"return": result.return_message, "data": result.data},
         "return": result.return_message,
         "data": result.data,
-        "verification": result.verification,
         "steps": result.steps,
         "elapsed_seconds": round(result.ended_at - result.started_at, 3),
         "artifacts_dir": str(artifacts.root_dir.resolve()),
diff --git a/src/models.py b/src/models.py
index b35fd61..7a55fca 100644
--- a/src/models.py
+++ b/src/models.py
@@ -21,7 +21,6 @@ class AgentResult:
     result: str
     return_message: str
     data: Any | None
-    verification: dict[str, Any] | None
     steps: int
     started_at: float
     ended_at: float
diff --git a/src/storage.py b/src/storage.py
index e40256b..3abd7eb 100644
--- a/src/storage.py
+++ b/src/storage.py
@@ -222,15 +222,14 @@ class HistoryDB:
     def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
         fallback_return = str(result or "").strip()
         if not response_json:
-            return {"return": fallback_return, "data": None, "verification": None}
+            return {"return": fallback_return, "data": None}
         try:
             payload = json.loads(response_json)
             if isinstance(payload, dict):
                 return {
                     "return": str(payload.get("return") or fallback_return),
                     "data": payload.get("data"),
-                    "verification": payload.get("verification"),
                 }
         except Exception:
             pass
-        return {"return": fallback_return, "data": None, "verification": None}
+        return {"return": fallback_return, "data": None}
diff --git a/src/task_manager.py b/src/task_manager.py
index 5618fda..3d8cc42 100644
--- a/src/task_manager.py
+++ b/src/task_manager.py
@@ -160,7 +160,7 @@ class JobManager:
                     ended_at=ended_at,
                     error=error_text,
                     result=error_text,
-                    response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False),
+                    response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False),
                 )
                 self._publish(
                     job_id,
@@ -239,7 +239,7 @@ class JobManager:
                 ended_at=ended_at,
                 error=err,
                 result=err,
-                response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False),
+                response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False),
             )
             self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
             with self._lock:
@@ -259,7 +259,6 @@ class JobManager:
                 {
                     "return": result.return_message,
                     "data": result.data,
-                    "verification": result.verification,
                 },
                 ensure_ascii=False,
             ),
@@ -284,7 +283,6 @@ class JobManager:
                     "status": status,
                     "result": result.return_message,
                     "response": {"return": result.return_message, "data": result.data},
-                    "verification": result.verification,
                     "error": result.error,
                     "cancelled": result.cancelled,
                     "usage": result.usage.to_dict(),
@@ -348,9 +346,8 @@ class JobManager:
     def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
         response = job.get("response")
         if not isinstance(response, dict):
-            response = {"return": str(job.get("result") or ""), "data": None, "verification": None}
+            response = {"return": str(job.get("result") or ""), "data": None}
             job["response"] = response
         job["return"] = str(response.get("return") or "")
         job["data"] = response.get("data")
-        job["verification"] = response.get("verification")
         return job
diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py
index 0db146e..32c0530 100644
--- a/tests/test_agent_tools.py
+++ b/tests/test_agent_tools.py
@@ -71,13 +71,9 @@ def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) ->
     assert result["ok"] is True
     assert result["return"] == "Task completed successfully"
     assert result["data"] == "file1\nfile2"
-    assert result["verification"]["ok"] is True
-    verification_path = Path(result["verification"]["path"])
-    assert verification_path.exists()
-    assert verification_path.name.startswith("screen_final_verification_step_")
+    assert "verification" not in result
     assert agent.final_result == "Task completed successfully"
     assert agent.final_data == "file1\nfile2"
-    assert agent.final_verification is not None
 
 
 def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6622b69..f058a97 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -35,7 +35,6 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
             result="Done",
             return_message="Task completed successfully",
             data="file1.txt\nfile2.txt",
-            verification={"ok": True, "path": "C:/tmp/final.png"},
             steps=3,
             started_at=10.0,
             ended_at=12.5,
@@ -67,4 +66,3 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
     assert payload["response"]["data"] == "file1.txt\nfile2.txt"
     assert payload["return"] == "Task completed successfully"
     assert payload["data"] == "file1.txt\nfile2.txt"
-    assert payload["verification"] == {"ok": True, "path": "C:/tmp/final.png"}