refactor: remove verification field from responses and update related documentation
All checks were successful
CI / test (push) Successful in 7s

This commit is contained in:
Space-Banane
2026-05-27 21:23:40 +02:00
parent 48a145d147
commit 595375e1a7
9 changed files with 26 additions and 64 deletions

View File

@@ -12,7 +12,6 @@ It lets an LLM use controlled local tools (screen, click, type, shell) to comple
- Returns structured agent output as: - Returns structured agent output as:
- `return`: human-readable completion message - `return`: human-readable completion message
- `data`: structured payload (for example command output) - `data`: structured payload (for example command output)
- `verification`: final screen-capture metadata for completion accuracy checks
## Core Features ## Core Features
@@ -94,11 +93,7 @@ CLI JSON output includes both legacy and structured fields:
"data": "file1.txt\nfile2.txt" "data": "file1.txt\nfile2.txt"
}, },
"return": "Task completed successfully", "return": "Task completed successfully",
"data": "file1.txt\nfile2.txt", "data": "file1.txt\nfile2.txt"
"verification": {
"ok": true,
"path": "C:/.../screens/screen_final_verification_step_003.png"
}
} }
``` ```
@@ -154,7 +149,6 @@ Each job payload includes:
- `response.return` - `response.return`
- `response.data` - `response.data`
- top-level `return` and `data` aliases - top-level `return` and `data` aliases
- `verification` (final screenshot path + metadata)
### Monitoring UI ### Monitoring UI
@@ -174,7 +168,7 @@ Each job payload includes:
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
- When done, call: - When done, call:
- `task_complete(return="...", data=...)` - `task_complete(return="...", data=...)`
- - A final verification screen capture is always taken automatically on completion.
+ - Before `task_complete`, verify expected on-screen content with `see_screen` (and `enhance` if needed), and include an `observed_result` summary in `data`.
`data` should contain useful structured output for the requester (text, object, list, etc.). `data` should contain useful structured output for the requester (text, object, list, etc.).

View File

@@ -10,7 +10,6 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin
- Mouse/keyboard control (`click`, `type`, `press_key`) - Mouse/keyboard control (`click`, `type`, `press_key`)
- Terminal execution (`execute_command`, `sleep`) - Terminal execution (`execute_command`, `sleep`)
- Structured completion payload (`task_complete(return=..., data=...)`) - Structured completion payload (`task_complete(return=..., data=...)`)
- Automatic final verification screen capture on completion
- Safety gate, auth, history, and live monitoring - Safety gate, auth, history, and live monitoring
## Important Environment Note ## Important Environment Note
@@ -31,13 +30,19 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr
1. Submit job via CLI or API. 1. Submit job via CLI or API.
2. Agent performs tool loop. 2. Agent performs tool loop.
- 3. Read final `response.return`, `response.data`, and `verification` from job status.
+ 3. Read final `response.return` and `response.data` from job status.
Keyboard combo rule: Keyboard combo rule:
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`. - For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
- Do not split modifier combos into separate calls. - Do not split modifier combos into separate calls.
Verification rule:
- Before `task_complete`, verify actual on-screen content matches the expected outcome.
- Use `see_screen` (and `enhance` if needed) for this check.
- Include a concise `observed_result` in `data` when completing the task.
## API Quick Reference ## API Quick Reference
Base URL: Base URL:
@@ -85,18 +90,10 @@ Result contract in job payload:
"status": "completed", "status": "completed",
"response": { "response": {
"return": "Task completed successfully", "return": "Task completed successfully",
"data": "file1.txt\nfile2.txt", "data": "file1.txt\nfile2.txt"
"verification": {
"ok": true,
"path": "C:/.../screens/screen_final_verification_step_006.png"
}
}, },
"return": "Task completed successfully", "return": "Task completed successfully",
"data": "file1.txt\nfile2.txt", "data": "file1.txt\nfile2.txt"
"verification": {
"ok": true,
"path": "C:/.../screens/screen_final_verification_step_006.png"
}
} }
``` ```

View File

@@ -42,7 +42,8 @@ Rules:
9) Keep tool arguments valid JSON and concise. 9) Keep tool arguments valid JSON and concise.
10) When objective is fully complete, call task_complete(return="...", data=...). 10) When objective is fully complete, call task_complete(return="...", data=...).
11) The "data" field should contain structured output useful for the requester (for example command output text). 11) The "data" field should contain structured output useful for the requester (for example command output text).
- 12) Before finishing, always verify outcome with a final screen capture.
+ 12) Before finishing, verify actual screen content matches the expected outcome.
13) For verification, call see_screen (and enhance if needed), then include a concise observed_result in data.
""" """
@@ -80,7 +81,6 @@ class ScreenJobAgent:
self.last_screen_meta: dict[str, Any] | None = None self.last_screen_meta: dict[str, Any] | None = None
self.click_history: list[tuple[int, int, float]] = [] self.click_history: list[tuple[int, int, float]] = []
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()} self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
self.final_verification: dict[str, Any] | None = None
def _emit(self, event_type: str, payload: dict[str, Any]) -> None: def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
if self.event_callback is None: if self.event_callback is None:
@@ -367,16 +367,6 @@ class ScreenJobAgent:
"message": "Screen captured with coordinate grid.", "message": "Screen captured with coordinate grid.",
} }
def _capture_final_verification(self) -> dict[str, Any]:
image, meta = self._capture_screen(with_grid=True)
out_path = self.artifacts.shots_dir / f"screen_final_verification_step_{self.step:03d}.png"
self._save_image(image, out_path)
data_url = image_to_data_url(image, "PNG")
verification_meta = meta | {"path": str(out_path.resolve()), "final_verification": True}
self.last_screen_data_url = data_url
self.last_screen_meta = verification_meta
return {"ok": True, "path": str(out_path.resolve()), "meta": verification_meta}
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]: def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
coord = args.get("coordinate") or {} coord = args.get("coordinate") or {}
x = int(coord.get("x", 0)) x = int(coord.get("x", 0))
@@ -608,12 +598,7 @@ class ScreenJobAgent:
self.completed = True self.completed = True
self.final_result = return_text self.final_result = return_text
self.final_data = data self.final_data = data
- try:
+ return {"ok": True, "return": return_text, "data": data}
verification = self._capture_final_verification()
except Exception as exc: # noqa: BLE001
verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
self.final_verification = verification
return {"ok": True, "return": return_text, "data": data, "verification": verification}
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]: def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
if name in self.disabled_tools: if name in self.disabled_tools:
@@ -681,8 +666,9 @@ class ScreenJobAgent:
"For modifier shortcuts, use a single press_key combo (example: win+r). " "For modifier shortcuts, use a single press_key combo (example: win+r). "
"You can return multiple tool calls in one step (example: click then sleep). " "You can return multiple tool calls in one step (example: click then sleep). "
"When done call task_complete(return=..., data=...). " "When done call task_complete(return=..., data=...). "
"Before task_complete, verify the screen content is what was expected "
"using see_screen/enhance and include observed_result in data. "
"Include useful structured output in data." "Include useful structured output in data."
"Always finish with a final screen verification."
), ),
} }
], ],
@@ -735,8 +721,9 @@ class ScreenJobAgent:
"No function call was returned. Continue by using tools. " "No function call was returned. Continue by using tools. "
"Use one press_key call for key combos like win+r. " "Use one press_key call for key combos like win+r. "
"You may call multiple tools in one step. " "You may call multiple tools in one step. "
"Before task_complete, verify expected screen content with see_screen/enhance "
"and include observed_result in data. "
"When complete, call task_complete(return=..., data=...)." "When complete, call task_complete(return=..., data=...)."
"Always finish with a final screen verification."
), ),
} }
], ],
@@ -812,7 +799,6 @@ class ScreenJobAgent:
result=self.final_result, result=self.final_result,
return_message=self.final_result, return_message=self.final_result,
data=self.final_data, data=self.final_data,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,
@@ -827,7 +813,6 @@ class ScreenJobAgent:
result="Cancelled by user request.", result="Cancelled by user request.",
return_message="Cancelled by user request.", return_message="Cancelled by user request.",
data=None, data=None,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,
@@ -844,7 +829,6 @@ class ScreenJobAgent:
result=error_text, result=error_text,
return_message=error_text, return_message=error_text,
data=None, data=None,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,
@@ -860,7 +844,6 @@ class ScreenJobAgent:
result=result_text, result=result_text,
return_message=result_text, return_message=result_text,
data=None, data=None,
verification=self.final_verification,
steps=self.step, steps=self.step,
started_at=started_at, started_at=started_at,
ended_at=ended_at, ended_at=ended_at,

View File

@@ -108,7 +108,6 @@ def main(argv: list[str] | None = None) -> int:
"response": {"return": result.return_message, "data": result.data}, "response": {"return": result.return_message, "data": result.data},
"return": result.return_message, "return": result.return_message,
"data": result.data, "data": result.data,
"verification": result.verification,
"steps": result.steps, "steps": result.steps,
"elapsed_seconds": round(result.ended_at - result.started_at, 3), "elapsed_seconds": round(result.ended_at - result.started_at, 3),
"artifacts_dir": str(artifacts.root_dir.resolve()), "artifacts_dir": str(artifacts.root_dir.resolve()),

View File

@@ -21,7 +21,6 @@ class AgentResult:
result: str result: str
return_message: str return_message: str
data: Any | None data: Any | None
verification: dict[str, Any] | None
steps: int steps: int
started_at: float started_at: float
ended_at: float ended_at: float

View File

@@ -222,15 +222,14 @@ class HistoryDB:
def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]: def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
fallback_return = str(result or "").strip() fallback_return = str(result or "").strip()
if not response_json: if not response_json:
- return {"return": fallback_return, "data": None, "verification": None}
+ return {"return": fallback_return, "data": None}
try: try:
payload = json.loads(response_json) payload = json.loads(response_json)
if isinstance(payload, dict): if isinstance(payload, dict):
return { return {
"return": str(payload.get("return") or fallback_return), "return": str(payload.get("return") or fallback_return),
"data": payload.get("data"), "data": payload.get("data"),
"verification": payload.get("verification"),
} }
except Exception: except Exception:
pass pass
- return {"return": fallback_return, "data": None, "verification": None}
+ return {"return": fallback_return, "data": None}

View File

@@ -160,7 +160,7 @@ class JobManager:
ended_at=ended_at, ended_at=ended_at,
error=error_text, error=error_text,
result=error_text, result=error_text,
- response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False),
+ response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False),
) )
self._publish( self._publish(
job_id, job_id,
@@ -239,7 +239,7 @@ class JobManager:
ended_at=ended_at, ended_at=ended_at,
error=err, error=err,
result=err, result=err,
- response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False),
+ response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False),
) )
self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}}) self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
with self._lock: with self._lock:
@@ -259,7 +259,6 @@ class JobManager:
{ {
"return": result.return_message, "return": result.return_message,
"data": result.data, "data": result.data,
"verification": result.verification,
}, },
ensure_ascii=False, ensure_ascii=False,
), ),
@@ -284,7 +283,6 @@ class JobManager:
"status": status, "status": status,
"result": result.return_message, "result": result.return_message,
"response": {"return": result.return_message, "data": result.data}, "response": {"return": result.return_message, "data": result.data},
"verification": result.verification,
"error": result.error, "error": result.error,
"cancelled": result.cancelled, "cancelled": result.cancelled,
"usage": result.usage.to_dict(), "usage": result.usage.to_dict(),
@@ -348,9 +346,8 @@ class JobManager:
def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]: def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
response = job.get("response") response = job.get("response")
if not isinstance(response, dict): if not isinstance(response, dict):
- response = {"return": str(job.get("result") or ""), "data": None, "verification": None}
+ response = {"return": str(job.get("result") or ""), "data": None}
job["response"] = response job["response"] = response
job["return"] = str(response.get("return") or "") job["return"] = str(response.get("return") or "")
job["data"] = response.get("data") job["data"] = response.get("data")
job["verification"] = response.get("verification")
return job return job

View File

@@ -71,13 +71,9 @@ def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) ->
assert result["ok"] is True assert result["ok"] is True
assert result["return"] == "Task completed successfully" assert result["return"] == "Task completed successfully"
assert result["data"] == "file1\nfile2" assert result["data"] == "file1\nfile2"
- assert result["verification"]["ok"] is True
+ assert "verification" not in result
verification_path = Path(result["verification"]["path"])
assert verification_path.exists()
assert verification_path.name.startswith("screen_final_verification_step_")
assert agent.final_result == "Task completed successfully" assert agent.final_result == "Task completed successfully"
assert agent.final_data == "file1\nfile2" assert agent.final_data == "file1\nfile2"
assert agent.final_verification is not None
def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None: def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None:

View File

@@ -35,7 +35,6 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
result="Done", result="Done",
return_message="Task completed successfully", return_message="Task completed successfully",
data="file1.txt\nfile2.txt", data="file1.txt\nfile2.txt",
verification={"ok": True, "path": "C:/tmp/final.png"},
steps=3, steps=3,
started_at=10.0, started_at=10.0,
ended_at=12.5, ended_at=12.5,
@@ -67,4 +66,3 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
assert payload["response"]["data"] == "file1.txt\nfile2.txt" assert payload["response"]["data"] == "file1.txt\nfile2.txt"
assert payload["return"] == "Task completed successfully" assert payload["return"] == "Task completed successfully"
assert payload["data"] == "file1.txt\nfile2.txt" assert payload["data"] == "file1.txt\nfile2.txt"
assert payload["verification"] == {"ok": True, "path": "C:/tmp/final.png"}