feat: add final verification screen capture and update response structure
All checks were successful
CI / test (push) Successful in 6s

This commit is contained in:
Space-Banane
2026-05-27 21:14:20 +02:00
parent 375c1073ec
commit 278f011a6d
8 changed files with 52 additions and 9 deletions

View File

@@ -12,6 +12,7 @@ It lets an LLM use controlled local tools (screen, click, type, shell) to comple
- Returns structured agent output as: - Returns structured agent output as:
- `return`: human-readable completion message - `return`: human-readable completion message
- `data`: structured payload (for example command output) - `data`: structured payload (for example command output)
- `verification`: result of the final screen capture taken on completion (`ok`, saved image `path`, and capture `meta`; on capture failure, `ok: false` with an `error` message)
## Core Features ## Core Features
@@ -93,7 +94,11 @@ CLI JSON output includes both legacy and structured fields:
"data": "file1.txt\nfile2.txt" "data": "file1.txt\nfile2.txt"
}, },
"return": "Task completed successfully", "return": "Task completed successfully",
"data": "file1.txt\nfile2.txt" "data": "file1.txt\nfile2.txt",
"verification": {
"ok": true,
"path": "C:/.../screens/screen_final_verification_step_003.png"
}
} }
``` ```
@@ -149,6 +154,7 @@ Each job payload includes:
- `response.return` - `response.return`
- `response.data` - `response.data`
- top-level `return` and `data` aliases - top-level `return` and `data` aliases
- `verification` (final screenshot path + metadata)
### Monitoring UI ### Monitoring UI
@@ -167,6 +173,7 @@ Each job payload includes:
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
- When done, call: - When done, call:
- `task_complete(return="...", data=...)` - `task_complete(return="...", data=...)`
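- A final verification screen capture is attempted automatically when `task_complete` is called; if the capture fails, `verification` reports `ok: false` with an `error` message instead of a path.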
`data` should contain useful structured output for the requester (text, object, list, etc.). `data` should contain useful structured output for the requester (text, object, list, etc.).

View File

@@ -41,6 +41,7 @@ Rules:
9) Keep tool arguments valid JSON and concise. 9) Keep tool arguments valid JSON and concise.
10) When objective is fully complete, call task_complete(return="...", data=...). 10) When objective is fully complete, call task_complete(return="...", data=...).
11) The "data" field should contain structured output useful for the requester (for example command output text). 11) The "data" field should contain structured output useful for the requester (for example command output text).
12) Before finishing, always verify outcome with a final screen capture.
""" """
@@ -78,6 +79,7 @@ class ScreenJobAgent:
self.last_screen_meta: dict[str, Any] | None = None self.last_screen_meta: dict[str, Any] | None = None
self.click_history: list[tuple[int, int, float]] = [] self.click_history: list[tuple[int, int, float]] = []
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()} self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
self.final_verification: dict[str, Any] | None = None
def _emit(self, event_type: str, payload: dict[str, Any]) -> None: def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
if self.event_callback is None: if self.event_callback is None:
@@ -361,6 +363,16 @@ class ScreenJobAgent:
"message": "Screen captured with coordinate grid.", "message": "Screen captured with coordinate grid.",
} }
def _capture_final_verification(self) -> dict[str, Any]:
    """Capture and persist a gridded screenshot of the final screen state.

    The image is written to ``self.artifacts.shots_dir`` under a name that
    embeds the current step number, and the agent's last-screen fields are
    updated to refer to this capture.

    Returns:
        A dict with ``ok`` set to True, the resolved ``path`` of the saved
        image, and ``meta``: the capture metadata extended with that path
        and a ``final_verification`` flag.
    """
    screenshot, capture_meta = self._capture_screen(with_grid=True)

    file_name = f"screen_final_verification_step_{self.step:03d}.png"
    target = self.artifacts.shots_dir / file_name
    self._save_image(screenshot, target)

    # Resolve once; the same absolute path is reported in both the metadata
    # and the top-level result.
    resolved_path = str(target.resolve())
    encoded = image_to_data_url(screenshot, "PNG")
    details = capture_meta | {"path": resolved_path, "final_verification": True}

    # Only update the agent state after every step above has succeeded.
    self.last_screen_data_url = encoded
    self.last_screen_meta = details
    return {"ok": True, "path": resolved_path, "meta": details}
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]: def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
coord = args.get("coordinate") or {} coord = args.get("coordinate") or {}
x = int(coord.get("x", 0)) x = int(coord.get("x", 0))
@@ -565,7 +577,12 @@ class ScreenJobAgent:
self.completed = True self.completed = True
self.final_result = return_text self.final_result = return_text
self.final_data = data self.final_data = data
return {"ok": True, "return": return_text, "data": data} try:
verification = self._capture_final_verification()
except Exception as exc: # noqa: BLE001
verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
self.final_verification = verification
return {"ok": True, "return": return_text, "data": data, "verification": verification}
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]: def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
if name in self.disabled_tools: if name in self.disabled_tools:
@@ -632,7 +649,8 @@ class ScreenJobAgent:
"You are in an action loop. Prefer execute_command for deterministic actions. " "You are in an action loop. Prefer execute_command for deterministic actions. "
"You can return multiple tool calls in one step (example: click then sleep). " "You can return multiple tool calls in one step (example: click then sleep). "
"When done call task_complete(return=..., data=...). " "When done call task_complete(return=..., data=...). "
"Include useful structured output in data." "Include useful structured output in data. "
"Always finish with a final screen verification."
), ),
} }
], ],
@@ -684,7 +702,8 @@ class ScreenJobAgent:
"text": ( "text": (
"No function call was returned. Continue by using tools. " "No function call was returned. Continue by using tools. "
"You may call multiple tools in one step. " "You may call multiple tools in one step. "
"When complete, call task_complete(return=..., data=...)." "When complete, call task_complete(return=..., data=...). "
"Always finish with a final screen verification."
), ),
} }
], ],
@@ -760,6 +779,7 @@ class ScreenJobAgent:
result=self.final_result, result=self.final_result,
return_message=self.final_result, return_message=self.final_result,
data=self.final_data, data=self.final_data,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,
@@ -774,6 +794,7 @@ class ScreenJobAgent:
result="Cancelled by user request.", result="Cancelled by user request.",
return_message="Cancelled by user request.", return_message="Cancelled by user request.",
data=None, data=None,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,
@@ -790,6 +811,7 @@ class ScreenJobAgent:
result=error_text, result=error_text,
return_message=error_text, return_message=error_text,
data=None, data=None,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,
@@ -805,6 +827,7 @@ class ScreenJobAgent:
result=result_text, result=result_text,
return_message=result_text, return_message=result_text,
data=None, data=None,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,

View File

@@ -108,6 +108,7 @@ def main(argv: list[str] | None = None) -> int:
"response": {"return": result.return_message, "data": result.data}, "response": {"return": result.return_message, "data": result.data},
"return": result.return_message, "return": result.return_message,
"data": result.data, "data": result.data,
"verification": result.verification,
"steps": result.steps, "steps": result.steps,
"elapsed_seconds": round(result.ended_at - result.started_at, 3), "elapsed_seconds": round(result.ended_at - result.started_at, 3),
"artifacts_dir": str(artifacts.root_dir.resolve()), "artifacts_dir": str(artifacts.root_dir.resolve()),

View File

@@ -21,6 +21,7 @@ class AgentResult:
result: str result: str
return_message: str return_message: str
data: Any | None data: Any | None
verification: dict[str, Any] | None
steps: int steps: int
started_at: float started_at: float
ended_at: float ended_at: float

View File

@@ -222,14 +222,15 @@ class HistoryDB:
def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]: def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
fallback_return = str(result or "").strip() fallback_return = str(result or "").strip()
if not response_json: if not response_json:
return {"return": fallback_return, "data": None} return {"return": fallback_return, "data": None, "verification": None}
try: try:
payload = json.loads(response_json) payload = json.loads(response_json)
if isinstance(payload, dict): if isinstance(payload, dict):
return { return {
"return": str(payload.get("return") or fallback_return), "return": str(payload.get("return") or fallback_return),
"data": payload.get("data"), "data": payload.get("data"),
"verification": payload.get("verification"),
} }
except Exception: except Exception:
pass pass
return {"return": fallback_return, "data": None} return {"return": fallback_return, "data": None, "verification": None}

View File

@@ -160,7 +160,7 @@ class JobManager:
ended_at=ended_at, ended_at=ended_at,
error=error_text, error=error_text,
result=error_text, result=error_text,
response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False), response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False),
) )
self._publish( self._publish(
job_id, job_id,
@@ -239,7 +239,7 @@ class JobManager:
ended_at=ended_at, ended_at=ended_at,
error=err, error=err,
result=err, result=err,
response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False), response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False),
) )
self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}}) self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
with self._lock: with self._lock:
@@ -259,6 +259,7 @@ class JobManager:
{ {
"return": result.return_message, "return": result.return_message,
"data": result.data, "data": result.data,
"verification": result.verification,
}, },
ensure_ascii=False, ensure_ascii=False,
), ),
@@ -283,6 +284,7 @@ class JobManager:
"status": status, "status": status,
"result": result.return_message, "result": result.return_message,
"response": {"return": result.return_message, "data": result.data}, "response": {"return": result.return_message, "data": result.data},
"verification": result.verification,
"error": result.error, "error": result.error,
"cancelled": result.cancelled, "cancelled": result.cancelled,
"usage": result.usage.to_dict(), "usage": result.usage.to_dict(),
@@ -346,8 +348,9 @@ class JobManager:
def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]: def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
response = job.get("response") response = job.get("response")
if not isinstance(response, dict): if not isinstance(response, dict):
response = {"return": str(job.get("result") or ""), "data": None} response = {"return": str(job.get("result") or ""), "data": None, "verification": None}
job["response"] = response job["response"] = response
job["return"] = str(response.get("return") or "") job["return"] = str(response.get("return") or "")
job["data"] = response.get("data") job["data"] = response.get("data")
job["verification"] = response.get("verification")
return job return job

View File

@@ -67,8 +67,13 @@ def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) ->
assert result["ok"] is True assert result["ok"] is True
assert result["return"] == "Task completed successfully" assert result["return"] == "Task completed successfully"
assert result["data"] == "file1\nfile2" assert result["data"] == "file1\nfile2"
assert result["verification"]["ok"] is True
verification_path = Path(result["verification"]["path"])
assert verification_path.exists()
assert verification_path.name.startswith("screen_final_verification_step_")
assert agent.final_result == "Task completed successfully" assert agent.final_result == "Task completed successfully"
assert agent.final_data == "file1\nfile2" assert agent.final_data == "file1\nfile2"
assert agent.final_verification is not None
def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None: def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None:

View File

@@ -35,6 +35,7 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
result="Done", result="Done",
return_message="Task completed successfully", return_message="Task completed successfully",
data="file1.txt\nfile2.txt", data="file1.txt\nfile2.txt",
verification={"ok": True, "path": "C:/tmp/final.png"},
steps=3, steps=3,
started_at=10.0, started_at=10.0,
ended_at=12.5, ended_at=12.5,
@@ -66,3 +67,4 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
assert payload["response"]["data"] == "file1.txt\nfile2.txt" assert payload["response"]["data"] == "file1.txt\nfile2.txt"
assert payload["return"] == "Task completed successfully" assert payload["return"] == "Task completed successfully"
assert payload["data"] == "file1.txt\nfile2.txt" assert payload["data"] == "file1.txt\nfile2.txt"
assert payload["verification"] == {"ok": True, "path": "C:/tmp/final.png"}