feat: add final verification screen capture and update response structure

2026-05-27 21:14:20 +02:00
parent 375c1073ec
commit 278f011a6d
8 changed files with 52 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ It lets an LLM use controlled local tools (screen, click, type, shell) to comple
 - Returns structured agent output as:
  - `return`: human-readable completion message
  - `data`: structured payload (for example command output)
+  - `verification`: result of the final screen capture (`ok`, `path`, `meta`, or `error` on failure), used to check that the task really completed

 ## Core Features

@@ -93,7 +94,11 @@ CLI JSON output includes both legacy and structured fields:
    "data": "file1.txt\nfile2.txt"
  },
  "return": "Task completed successfully",
-  "data": "file1.txt\nfile2.txt"
+  "data": "file1.txt\nfile2.txt",
+  "verification": {
+    "ok": true,
+    "path": "C:/.../screens/screen_final_verification_step_003.png"
+  }
 }
 ```

@@ -149,6 +154,7 @@ Each job payload includes:
 - `response.return`
 - `response.data`
 - top-level `return` and `data` aliases
+- `verification` (final screenshot path + metadata)

 ### Monitoring UI

@@ -167,6 +173,7 @@ Each job payload includes:
 - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
 - When done, call:
  - `task_complete(return="...", data=...)`
+- A final verification screen capture is attempted automatically on completion; if the capture fails, `verification.ok` is `false` and `verification.error` describes the failure.

 `data` should contain useful structured output for the requester (text, object, list, etc.).

--- a/src/agent.py
+++ b/src/agent.py
@@ -41,6 +41,7 @@ Rules:
 9) Keep tool arguments valid JSON and concise.
 10) When objective is fully complete, call task_complete(return="...", data=...).
 11) The "data" field should contain structured output useful for the requester (for example command output text).
+12) Before finishing, always verify outcome with a final screen capture.
 """


@@ -78,6 +79,7 @@ class ScreenJobAgent:
        self.last_screen_meta: dict[str, Any] | None = None
        self.click_history: list[tuple[int, int, float]] = []
        self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
+        self.final_verification: dict[str, Any] | None = None

    def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
        if self.event_callback is None:
@@ -361,6 +363,16 @@ class ScreenJobAgent:
            "message": "Screen captured with coordinate grid.",
        }

+    def _capture_final_verification(self) -> dict[str, Any]:
+        image, meta = self._capture_screen(with_grid=True)
+        out_path = self.artifacts.shots_dir / f"screen_final_verification_step_{self.step:03d}.png"
+        self._save_image(image, out_path)
+        data_url = image_to_data_url(image, "PNG")
+        verification_meta = meta | {"path": str(out_path.resolve()), "final_verification": True}
+        self.last_screen_data_url = data_url
+        self.last_screen_meta = verification_meta
+        return {"ok": True, "path": str(out_path.resolve()), "meta": verification_meta}
+
    def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
        coord = args.get("coordinate") or {}
        x = int(coord.get("x", 0))
@@ -565,7 +577,12 @@ class ScreenJobAgent:
        self.completed = True
        self.final_result = return_text
        self.final_data = data
-        return {"ok": True, "return": return_text, "data": data}
+        try:
+            verification = self._capture_final_verification()
+        except Exception as exc:  # noqa: BLE001
+            verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
+        self.final_verification = verification
+        return {"ok": True, "return": return_text, "data": data, "verification": verification}

    def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
        if name in self.disabled_tools:
@@ -633,6 +650,7 @@ class ScreenJobAgent:
                            "You can return multiple tool calls in one step (example: click then sleep). "
                            "When done call task_complete(return=..., data=...). "
                            "Include useful structured output in data. "
+                            "Always finish with a final screen verification."
                        ),
                    }
                ],
@@ -685,6 +703,7 @@ class ScreenJobAgent:
                                    "No function call was returned. Continue by using tools. "
                                    "You may call multiple tools in one step. "
                                    "When complete, call task_complete(return=..., data=...). "
+                                    "Always finish with a final screen verification."
                                ),
                            }
                        ],
@@ -760,6 +779,7 @@ class ScreenJobAgent:
                result=self.final_result,
                return_message=self.final_result,
                data=self.final_data,
+                verification=self.final_verification,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
@@ -774,6 +794,7 @@ class ScreenJobAgent:
                result="Cancelled by user request.",
                return_message="Cancelled by user request.",
                data=None,
+                verification=self.final_verification,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
@@ -790,6 +811,7 @@ class ScreenJobAgent:
                result=error_text,
                return_message=error_text,
                data=None,
+                verification=self.final_verification,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
@@ -805,6 +827,7 @@ class ScreenJobAgent:
            result=result_text,
            return_message=result_text,
            data=None,
+            verification=self.final_verification,
            steps=self.step,
            started_at=started_at,
            ended_at=ended_at,
--- a/src/cli.py
+++ b/src/cli.py
@@ -108,6 +108,7 @@ def main(argv: list[str] | None = None) -> int:
        "response": {"return": result.return_message, "data": result.data},
        "return": result.return_message,
        "data": result.data,
+        "verification": result.verification,
        "steps": result.steps,
        "elapsed_seconds": round(result.ended_at - result.started_at, 3),
        "artifacts_dir": str(artifacts.root_dir.resolve()),
--- a/src/models.py
+++ b/src/models.py
@@ -21,6 +21,7 @@ class AgentResult:
    result: str
    return_message: str
    data: Any | None
+    verification: dict[str, Any] | None
    steps: int
    started_at: float
    ended_at: float
--- a/src/storage.py
+++ b/src/storage.py
@@ -222,14 +222,15 @@ class HistoryDB:
    def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
        fallback_return = str(result or "").strip()
        if not response_json:
-            return {"return": fallback_return, "data": None}
+            return {"return": fallback_return, "data": None, "verification": None}
        try:
            payload = json.loads(response_json)
            if isinstance(payload, dict):
                return {
                    "return": str(payload.get("return") or fallback_return),
                    "data": payload.get("data"),
+                    "verification": payload.get("verification"),
                }
        except Exception:
            pass
-        return {"return": fallback_return, "data": None}
+        return {"return": fallback_return, "data": None, "verification": None}
--- a/src/task_manager.py
+++ b/src/task_manager.py
@@ -160,7 +160,7 @@ class JobManager:
                    ended_at=ended_at,
                    error=error_text,
                    result=error_text,
-                    response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False),
+                    response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False),
                )
                self._publish(
                    job_id,
@@ -239,7 +239,7 @@ class JobManager:
                ended_at=ended_at,
                error=err,
                result=err,
-                response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False),
+                response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False),
            )
            self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
            with self._lock:
@@ -259,6 +259,7 @@ class JobManager:
                {
                    "return": result.return_message,
                    "data": result.data,
+                    "verification": result.verification,
                },
                ensure_ascii=False,
            ),
@@ -283,6 +284,7 @@ class JobManager:
                    "status": status,
                    "result": result.return_message,
                    "response": {"return": result.return_message, "data": result.data},
+                    "verification": result.verification,
                    "error": result.error,
                    "cancelled": result.cancelled,
                    "usage": result.usage.to_dict(),
@@ -346,8 +348,9 @@ class JobManager:
    def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
        response = job.get("response")
        if not isinstance(response, dict):
-            response = {"return": str(job.get("result") or ""), "data": None}
+            response = {"return": str(job.get("result") or ""), "data": None, "verification": None}
            job["response"] = response
        job["return"] = str(response.get("return") or "")
        job["data"] = response.get("data")
+        job["verification"] = response.get("verification")
        return job
--- a/tests/test_agent_tools.py
+++ b/tests/test_agent_tools.py
@@ -67,8 +67,13 @@ def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) ->
    assert result["ok"] is True
    assert result["return"] == "Task completed successfully"
    assert result["data"] == "file1\nfile2"
+    assert result["verification"]["ok"] is True
+    verification_path = Path(result["verification"]["path"])
+    assert verification_path.exists()
+    assert verification_path.name.startswith("screen_final_verification_step_")
    assert agent.final_result == "Task completed successfully"
    assert agent.final_data == "file1\nfile2"
+    assert agent.final_verification is not None


 def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None:
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -35,6 +35,7 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
            result="Done",
            return_message="Task completed successfully",
            data="file1.txt\nfile2.txt",
+            verification={"ok": True, "path": "C:/tmp/final.png"},
            steps=3,
            started_at=10.0,
            ended_at=12.5,
@@ -66,3 +67,4 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
    assert payload["response"]["data"] == "file1.txt\nfile2.txt"
    assert payload["return"] == "Task completed successfully"
    assert payload["data"] == "file1.txt\nfile2.txt"
+    assert payload["verification"] == {"ok": True, "path": "C:/tmp/final.png"}