feat: add final verification screen capture and update response structure
All checks were successful
CI / test (push) Successful in 6s

This commit is contained in:
Space-Banane
2026-05-27 21:14:20 +02:00
parent 375c1073ec
commit 278f011a6d
8 changed files with 52 additions and 9 deletions

View File

@@ -41,6 +41,7 @@ Rules:
9) Keep tool arguments valid JSON and concise.
10) When objective is fully complete, call task_complete(return="...", data=...).
11) The "data" field should contain structured output useful for the requester (for example command output text).
12) Before finishing, always verify outcome with a final screen capture.
"""
@@ -78,6 +79,7 @@ class ScreenJobAgent:
self.last_screen_meta: dict[str, Any] | None = None
self.click_history: list[tuple[int, int, float]] = []
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
self.final_verification: dict[str, Any] | None = None
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
if self.event_callback is None:
@@ -361,6 +363,16 @@ class ScreenJobAgent:
"message": "Screen captured with coordinate grid.",
}
def _capture_final_verification(self) -> dict[str, Any]:
    """Take one last screenshot and record it as the final verification.

    The capture is requested with ``with_grid=True``, written to the shots
    directory under a name that embeds the current step number, and also
    stored on the agent as the most recent screen (data URL + metadata).

    Returns:
        A dict with ``ok`` set to ``True``, the absolute ``path`` of the
        saved PNG, and ``meta`` (the capture metadata extended with that
        path and a ``final_verification`` flag).
    """
    shot, shot_meta = self._capture_screen(with_grid=True)

    filename = f"screen_final_verification_step_{self.step:03d}.png"
    target = self.artifacts.shots_dir / filename
    self._save_image(shot, target)

    encoded = image_to_data_url(shot, "PNG")

    # Resolve once, after the file has been written, and reuse the string
    # for both the metadata and the returned payload.
    resolved = str(target.resolve())
    enriched = shot_meta | {"path": resolved, "final_verification": True}

    # The verification shot becomes the agent's "last seen" screen.
    self.last_screen_data_url = encoded
    self.last_screen_meta = enriched

    return {"ok": True, "path": resolved, "meta": enriched}
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
coord = args.get("coordinate") or {}
x = int(coord.get("x", 0))
@@ -565,7 +577,12 @@ class ScreenJobAgent:
self.completed = True
self.final_result = return_text
self.final_data = data
return {"ok": True, "return": return_text, "data": data}
try:
verification = self._capture_final_verification()
except Exception as exc: # noqa: BLE001
verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
self.final_verification = verification
return {"ok": True, "return": return_text, "data": data, "verification": verification}
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
if name in self.disabled_tools:
@@ -632,7 +649,8 @@ class ScreenJobAgent:
"You are in an action loop. Prefer execute_command for deterministic actions. "
"You can return multiple tool calls in one step (example: click then sleep). "
"When done call task_complete(return=..., data=...). "
"Include useful structured output in data."
"Include useful structured output in data. "
"Always finish with a final screen verification."
),
}
],
@@ -684,7 +702,8 @@ class ScreenJobAgent:
"text": (
"No function call was returned. Continue by using tools. "
"You may call multiple tools in one step. "
"When complete, call task_complete(return=..., data=...)."
"When complete, call task_complete(return=..., data=...). "
"Always finish with a final screen verification."
),
}
],
@@ -760,6 +779,7 @@ class ScreenJobAgent:
result=self.final_result,
return_message=self.final_result,
data=self.final_data,
verification=self.final_verification,
steps=self.step,
started_at=started_at,
ended_at=ended_at,
@@ -774,6 +794,7 @@ class ScreenJobAgent:
result="Cancelled by user request.",
return_message="Cancelled by user request.",
data=None,
verification=self.final_verification,
steps=self.step,
started_at=started_at,
ended_at=ended_at,
@@ -790,6 +811,7 @@ class ScreenJobAgent:
result=error_text,
return_message=error_text,
data=None,
verification=self.final_verification,
steps=self.step,
started_at=started_at,
ended_at=ended_at,
@@ -805,6 +827,7 @@ class ScreenJobAgent:
result=result_text,
return_message=result_text,
data=None,
verification=self.final_verification,
steps=self.step,
started_at=started_at,
ended_at=ended_at,

View File

@@ -108,6 +108,7 @@ def main(argv: list[str] | None = None) -> int:
"response": {"return": result.return_message, "data": result.data},
"return": result.return_message,
"data": result.data,
"verification": result.verification,
"steps": result.steps,
"elapsed_seconds": round(result.ended_at - result.started_at, 3),
"artifacts_dir": str(artifacts.root_dir.resolve()),

View File

@@ -21,6 +21,7 @@ class AgentResult:
result: str
return_message: str
data: Any | None
verification: dict[str, Any] | None
steps: int
started_at: float
ended_at: float

View File

@@ -222,14 +222,15 @@ class HistoryDB:
def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
fallback_return = str(result or "").strip()
if not response_json:
return {"return": fallback_return, "data": None}
return {"return": fallback_return, "data": None, "verification": None}
try:
payload = json.loads(response_json)
if isinstance(payload, dict):
return {
"return": str(payload.get("return") or fallback_return),
"data": payload.get("data"),
"verification": payload.get("verification"),
}
except Exception:
pass
return {"return": fallback_return, "data": None}
return {"return": fallback_return, "data": None, "verification": None}

View File

@@ -160,7 +160,7 @@ class JobManager:
ended_at=ended_at,
error=error_text,
result=error_text,
response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False),
response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False),
)
self._publish(
job_id,
@@ -239,7 +239,7 @@ class JobManager:
ended_at=ended_at,
error=err,
result=err,
response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False),
response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False),
)
self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
with self._lock:
@@ -259,6 +259,7 @@ class JobManager:
{
"return": result.return_message,
"data": result.data,
"verification": result.verification,
},
ensure_ascii=False,
),
@@ -283,6 +284,7 @@ class JobManager:
"status": status,
"result": result.return_message,
"response": {"return": result.return_message, "data": result.data},
"verification": result.verification,
"error": result.error,
"cancelled": result.cancelled,
"usage": result.usage.to_dict(),
@@ -346,8 +348,9 @@ class JobManager:
def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
response = job.get("response")
if not isinstance(response, dict):
response = {"return": str(job.get("result") or ""), "data": None}
response = {"return": str(job.get("result") or ""), "data": None, "verification": None}
job["response"] = response
job["return"] = str(response.get("return") or "")
job["data"] = response.get("data")
job["verification"] = response.get("verification")
return job