refactor: remove verification field from responses and update related documentation
All checks were successful
CI / test (push) Successful in 7s
All checks were successful
CI / test (push) Successful in 7s
This commit is contained in:
10
README.md
10
README.md
@@ -12,7 +12,6 @@ It lets an LLM use controlled local tools (screen, click, type, shell) to comple
|
|||||||
- Returns structured agent output as:
|
- Returns structured agent output as:
|
||||||
- `return`: human-readable completion message
|
- `return`: human-readable completion message
|
||||||
- `data`: structured payload (for example command output)
|
- `data`: structured payload (for example command output)
|
||||||
- `verification`: final screen-capture metadata for completion accuracy checks
|
|
||||||
|
|
||||||
## Core Features
|
## Core Features
|
||||||
|
|
||||||
@@ -94,11 +93,7 @@ CLI JSON output includes both legacy and structured fields:
|
|||||||
"data": "file1.txt\nfile2.txt"
|
"data": "file1.txt\nfile2.txt"
|
||||||
},
|
},
|
||||||
"return": "Task completed successfully",
|
"return": "Task completed successfully",
|
||||||
"data": "file1.txt\nfile2.txt",
|
"data": "file1.txt\nfile2.txt"
|
||||||
"verification": {
|
|
||||||
"ok": true,
|
|
||||||
"path": "C:/.../screens/screen_final_verification_step_003.png"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -154,7 +149,6 @@ Each job payload includes:
|
|||||||
- `response.return`
|
- `response.return`
|
||||||
- `response.data`
|
- `response.data`
|
||||||
- top-level `return` and `data` aliases
|
- top-level `return` and `data` aliases
|
||||||
- `verification` (final screenshot path + metadata)
|
|
||||||
|
|
||||||
### Monitoring UI
|
### Monitoring UI
|
||||||
|
|
||||||
@@ -174,7 +168,7 @@ Each job payload includes:
|
|||||||
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
|
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
|
||||||
- When done, call:
|
- When done, call:
|
||||||
- `task_complete(return="...", data=...)`
|
- `task_complete(return="...", data=...)`
|
||||||
- A final verification screen capture is always taken automatically on completion.
|
- Before `task_complete`, verify expected on-screen content with `see_screen` (and `enhance` if needed), and include an `observed_result` summary in `data`.
|
||||||
|
|
||||||
`data` should contain useful structured output for the requester (text, object, list, etc.).
|
`data` should contain useful structured output for the requester (text, object, list, etc.).
|
||||||
|
|
||||||
|
|||||||
21
SKILL.md
21
SKILL.md
@@ -10,7 +10,6 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin
|
|||||||
- Mouse/keyboard control (`click`, `type`, `press_key`)
|
- Mouse/keyboard control (`click`, `type`, `press_key`)
|
||||||
- Terminal execution (`execute_command`, `sleep`)
|
- Terminal execution (`execute_command`, `sleep`)
|
||||||
- Structured completion payload (`task_complete(return=..., data=...)`)
|
- Structured completion payload (`task_complete(return=..., data=...)`)
|
||||||
- Automatic final verification screen capture on completion
|
|
||||||
- Safety gate, auth, history, and live monitoring
|
- Safety gate, auth, history, and live monitoring
|
||||||
|
|
||||||
## Important Environment Note
|
## Important Environment Note
|
||||||
@@ -31,13 +30,19 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr
|
|||||||
|
|
||||||
1. Submit job via CLI or API.
|
1. Submit job via CLI or API.
|
||||||
2. Agent performs tool loop.
|
2. Agent performs tool loop.
|
||||||
3. Read final `response.return`, `response.data`, and `verification` from job status.
|
3. Read final `response.return` and `response.data` from job status.
|
||||||
|
|
||||||
Keyboard combo rule:
|
Keyboard combo rule:
|
||||||
|
|
||||||
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
|
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
|
||||||
- Do not split modifier combos into separate calls.
|
- Do not split modifier combos into separate calls.
|
||||||
|
|
||||||
|
Verification rule:
|
||||||
|
|
||||||
|
- Before `task_complete`, verify actual on-screen content matches the expected outcome.
|
||||||
|
- Use `see_screen` (and `enhance` if needed) for this check.
|
||||||
|
- Include a concise `observed_result` in `data` when completing the task.
|
||||||
|
|
||||||
## API Quick Reference
|
## API Quick Reference
|
||||||
|
|
||||||
Base URL:
|
Base URL:
|
||||||
@@ -85,18 +90,10 @@ Result contract in job payload:
|
|||||||
"status": "completed",
|
"status": "completed",
|
||||||
"response": {
|
"response": {
|
||||||
"return": "Task completed successfully",
|
"return": "Task completed successfully",
|
||||||
"data": "file1.txt\nfile2.txt",
|
"data": "file1.txt\nfile2.txt"
|
||||||
"verification": {
|
|
||||||
"ok": true,
|
|
||||||
"path": "C:/.../screens/screen_final_verification_step_006.png"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"return": "Task completed successfully",
|
"return": "Task completed successfully",
|
||||||
"data": "file1.txt\nfile2.txt",
|
"data": "file1.txt\nfile2.txt"
|
||||||
"verification": {
|
|
||||||
"ok": true,
|
|
||||||
"path": "C:/.../screens/screen_final_verification_step_006.png"
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
35
src/agent.py
35
src/agent.py
@@ -42,7 +42,8 @@ Rules:
|
|||||||
9) Keep tool arguments valid JSON and concise.
|
9) Keep tool arguments valid JSON and concise.
|
||||||
10) When objective is fully complete, call task_complete(return="...", data=...).
|
10) When objective is fully complete, call task_complete(return="...", data=...).
|
||||||
11) The "data" field should contain structured output useful for the requester (for example command output text).
|
11) The "data" field should contain structured output useful for the requester (for example command output text).
|
||||||
12) Before finishing, always verify outcome with a final screen capture.
|
12) Before finishing, verify actual screen content matches the expected outcome.
|
||||||
|
13) For verification, call see_screen (and enhance if needed), then include a concise observed_result in data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@@ -80,7 +81,6 @@ class ScreenJobAgent:
|
|||||||
self.last_screen_meta: dict[str, Any] | None = None
|
self.last_screen_meta: dict[str, Any] | None = None
|
||||||
self.click_history: list[tuple[int, int, float]] = []
|
self.click_history: list[tuple[int, int, float]] = []
|
||||||
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
|
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
|
||||||
self.final_verification: dict[str, Any] | None = None
|
|
||||||
|
|
||||||
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
|
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
|
||||||
if self.event_callback is None:
|
if self.event_callback is None:
|
||||||
@@ -367,16 +367,6 @@ class ScreenJobAgent:
|
|||||||
"message": "Screen captured with coordinate grid.",
|
"message": "Screen captured with coordinate grid.",
|
||||||
}
|
}
|
||||||
|
|
||||||
def _capture_final_verification(self) -> dict[str, Any]:
|
|
||||||
image, meta = self._capture_screen(with_grid=True)
|
|
||||||
out_path = self.artifacts.shots_dir / f"screen_final_verification_step_{self.step:03d}.png"
|
|
||||||
self._save_image(image, out_path)
|
|
||||||
data_url = image_to_data_url(image, "PNG")
|
|
||||||
verification_meta = meta | {"path": str(out_path.resolve()), "final_verification": True}
|
|
||||||
self.last_screen_data_url = data_url
|
|
||||||
self.last_screen_meta = verification_meta
|
|
||||||
return {"ok": True, "path": str(out_path.resolve()), "meta": verification_meta}
|
|
||||||
|
|
||||||
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
|
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||||
coord = args.get("coordinate") or {}
|
coord = args.get("coordinate") or {}
|
||||||
x = int(coord.get("x", 0))
|
x = int(coord.get("x", 0))
|
||||||
@@ -608,12 +598,7 @@ class ScreenJobAgent:
|
|||||||
self.completed = True
|
self.completed = True
|
||||||
self.final_result = return_text
|
self.final_result = return_text
|
||||||
self.final_data = data
|
self.final_data = data
|
||||||
try:
|
return {"ok": True, "return": return_text, "data": data}
|
||||||
verification = self._capture_final_verification()
|
|
||||||
except Exception as exc: # noqa: BLE001
|
|
||||||
verification = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
|
|
||||||
self.final_verification = verification
|
|
||||||
return {"ok": True, "return": return_text, "data": data, "verification": verification}
|
|
||||||
|
|
||||||
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
|
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
|
||||||
if name in self.disabled_tools:
|
if name in self.disabled_tools:
|
||||||
@@ -681,8 +666,9 @@ class ScreenJobAgent:
|
|||||||
"For modifier shortcuts, use a single press_key combo (example: win+r). "
|
"For modifier shortcuts, use a single press_key combo (example: win+r). "
|
||||||
"You can return multiple tool calls in one step (example: click then sleep). "
|
"You can return multiple tool calls in one step (example: click then sleep). "
|
||||||
"When done call task_complete(return=..., data=...). "
|
"When done call task_complete(return=..., data=...). "
|
||||||
"Include useful structured output in data. "
|
"Before task_complete, verify the screen content is what was expected "
|
||||||
"Always finish with a final screen verification."
|
"using see_screen/enhance and include observed_result in data. "
|
||||||
|
"Include useful structured output in data."
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -735,8 +721,9 @@ class ScreenJobAgent:
|
|||||||
"No function call was returned. Continue by using tools. "
|
"No function call was returned. Continue by using tools. "
|
||||||
"Use one press_key call for key combos like win+r. "
|
"Use one press_key call for key combos like win+r. "
|
||||||
"You may call multiple tools in one step. "
|
"You may call multiple tools in one step. "
|
||||||
"When complete, call task_complete(return=..., data=...). "
|
"Before task_complete, verify expected screen content with see_screen/enhance "
|
||||||
"Always finish with a final screen verification."
|
"and include observed_result in data. "
|
||||||
|
"When complete, call task_complete(return=..., data=...)."
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -812,7 +799,6 @@ class ScreenJobAgent:
|
|||||||
result=self.final_result,
|
result=self.final_result,
|
||||||
return_message=self.final_result,
|
return_message=self.final_result,
|
||||||
data=self.final_data,
|
data=self.final_data,
|
||||||
verification=self.final_verification,
|
|
||||||
steps=self.step,
|
steps=self.step,
|
||||||
started_at=started_at,
|
started_at=started_at,
|
||||||
ended_at=ended_at,
|
ended_at=ended_at,
|
||||||
@@ -827,7 +813,6 @@ class ScreenJobAgent:
|
|||||||
result="Cancelled by user request.",
|
result="Cancelled by user request.",
|
||||||
return_message="Cancelled by user request.",
|
return_message="Cancelled by user request.",
|
||||||
data=None,
|
data=None,
|
||||||
verification=self.final_verification,
|
|
||||||
steps=self.step,
|
steps=self.step,
|
||||||
started_at=started_at,
|
started_at=started_at,
|
||||||
ended_at=ended_at,
|
ended_at=ended_at,
|
||||||
@@ -844,7 +829,6 @@ class ScreenJobAgent:
|
|||||||
result=error_text,
|
result=error_text,
|
||||||
return_message=error_text,
|
return_message=error_text,
|
||||||
data=None,
|
data=None,
|
||||||
verification=self.final_verification,
|
|
||||||
steps=self.step,
|
steps=self.step,
|
||||||
started_at=started_at,
|
started_at=started_at,
|
||||||
ended_at=ended_at,
|
ended_at=ended_at,
|
||||||
@@ -860,7 +844,6 @@ class ScreenJobAgent:
|
|||||||
result=result_text,
|
result=result_text,
|
||||||
return_message=result_text,
|
return_message=result_text,
|
||||||
data=None,
|
data=None,
|
||||||
verification=self.final_verification,
|
|
||||||
steps=self.step,
|
steps=self.step,
|
||||||
started_at=started_at,
|
started_at=started_at,
|
||||||
ended_at=ended_at,
|
ended_at=ended_at,
|
||||||
|
|||||||
@@ -108,7 +108,6 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
"response": {"return": result.return_message, "data": result.data},
|
"response": {"return": result.return_message, "data": result.data},
|
||||||
"return": result.return_message,
|
"return": result.return_message,
|
||||||
"data": result.data,
|
"data": result.data,
|
||||||
"verification": result.verification,
|
|
||||||
"steps": result.steps,
|
"steps": result.steps,
|
||||||
"elapsed_seconds": round(result.ended_at - result.started_at, 3),
|
"elapsed_seconds": round(result.ended_at - result.started_at, 3),
|
||||||
"artifacts_dir": str(artifacts.root_dir.resolve()),
|
"artifacts_dir": str(artifacts.root_dir.resolve()),
|
||||||
|
|||||||
@@ -21,7 +21,6 @@ class AgentResult:
|
|||||||
result: str
|
result: str
|
||||||
return_message: str
|
return_message: str
|
||||||
data: Any | None
|
data: Any | None
|
||||||
verification: dict[str, Any] | None
|
|
||||||
steps: int
|
steps: int
|
||||||
started_at: float
|
started_at: float
|
||||||
ended_at: float
|
ended_at: float
|
||||||
|
|||||||
@@ -222,15 +222,14 @@ class HistoryDB:
|
|||||||
def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
|
def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]:
|
||||||
fallback_return = str(result or "").strip()
|
fallback_return = str(result or "").strip()
|
||||||
if not response_json:
|
if not response_json:
|
||||||
return {"return": fallback_return, "data": None, "verification": None}
|
return {"return": fallback_return, "data": None}
|
||||||
try:
|
try:
|
||||||
payload = json.loads(response_json)
|
payload = json.loads(response_json)
|
||||||
if isinstance(payload, dict):
|
if isinstance(payload, dict):
|
||||||
return {
|
return {
|
||||||
"return": str(payload.get("return") or fallback_return),
|
"return": str(payload.get("return") or fallback_return),
|
||||||
"data": payload.get("data"),
|
"data": payload.get("data"),
|
||||||
"verification": payload.get("verification"),
|
|
||||||
}
|
}
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return {"return": fallback_return, "data": None, "verification": None}
|
return {"return": fallback_return, "data": None}
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ class JobManager:
|
|||||||
ended_at=ended_at,
|
ended_at=ended_at,
|
||||||
error=error_text,
|
error=error_text,
|
||||||
result=error_text,
|
result=error_text,
|
||||||
response_json=json.dumps({"return": error_text, "data": None, "verification": None}, ensure_ascii=False),
|
response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False),
|
||||||
)
|
)
|
||||||
self._publish(
|
self._publish(
|
||||||
job_id,
|
job_id,
|
||||||
@@ -239,7 +239,7 @@ class JobManager:
|
|||||||
ended_at=ended_at,
|
ended_at=ended_at,
|
||||||
error=err,
|
error=err,
|
||||||
result=err,
|
result=err,
|
||||||
response_json=json.dumps({"return": err, "data": None, "verification": None}, ensure_ascii=False),
|
response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False),
|
||||||
)
|
)
|
||||||
self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
|
self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}})
|
||||||
with self._lock:
|
with self._lock:
|
||||||
@@ -259,7 +259,6 @@ class JobManager:
|
|||||||
{
|
{
|
||||||
"return": result.return_message,
|
"return": result.return_message,
|
||||||
"data": result.data,
|
"data": result.data,
|
||||||
"verification": result.verification,
|
|
||||||
},
|
},
|
||||||
ensure_ascii=False,
|
ensure_ascii=False,
|
||||||
),
|
),
|
||||||
@@ -284,7 +283,6 @@ class JobManager:
|
|||||||
"status": status,
|
"status": status,
|
||||||
"result": result.return_message,
|
"result": result.return_message,
|
||||||
"response": {"return": result.return_message, "data": result.data},
|
"response": {"return": result.return_message, "data": result.data},
|
||||||
"verification": result.verification,
|
|
||||||
"error": result.error,
|
"error": result.error,
|
||||||
"cancelled": result.cancelled,
|
"cancelled": result.cancelled,
|
||||||
"usage": result.usage.to_dict(),
|
"usage": result.usage.to_dict(),
|
||||||
@@ -348,9 +346,8 @@ class JobManager:
|
|||||||
def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
|
def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]:
|
||||||
response = job.get("response")
|
response = job.get("response")
|
||||||
if not isinstance(response, dict):
|
if not isinstance(response, dict):
|
||||||
response = {"return": str(job.get("result") or ""), "data": None, "verification": None}
|
response = {"return": str(job.get("result") or ""), "data": None}
|
||||||
job["response"] = response
|
job["response"] = response
|
||||||
job["return"] = str(response.get("return") or "")
|
job["return"] = str(response.get("return") or "")
|
||||||
job["data"] = response.get("data")
|
job["data"] = response.get("data")
|
||||||
job["verification"] = response.get("verification")
|
|
||||||
return job
|
return job
|
||||||
|
|||||||
@@ -71,13 +71,9 @@ def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) ->
|
|||||||
assert result["ok"] is True
|
assert result["ok"] is True
|
||||||
assert result["return"] == "Task completed successfully"
|
assert result["return"] == "Task completed successfully"
|
||||||
assert result["data"] == "file1\nfile2"
|
assert result["data"] == "file1\nfile2"
|
||||||
assert result["verification"]["ok"] is True
|
assert "verification" not in result
|
||||||
verification_path = Path(result["verification"]["path"])
|
|
||||||
assert verification_path.exists()
|
|
||||||
assert verification_path.name.startswith("screen_final_verification_step_")
|
|
||||||
assert agent.final_result == "Task completed successfully"
|
assert agent.final_result == "Task completed successfully"
|
||||||
assert agent.final_data == "file1\nfile2"
|
assert agent.final_data == "file1\nfile2"
|
||||||
assert agent.final_verification is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None:
|
def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None:
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
|
|||||||
result="Done",
|
result="Done",
|
||||||
return_message="Task completed successfully",
|
return_message="Task completed successfully",
|
||||||
data="file1.txt\nfile2.txt",
|
data="file1.txt\nfile2.txt",
|
||||||
verification={"ok": True, "path": "C:/tmp/final.png"},
|
|
||||||
steps=3,
|
steps=3,
|
||||||
started_at=10.0,
|
started_at=10.0,
|
||||||
ended_at=12.5,
|
ended_at=12.5,
|
||||||
@@ -67,4 +66,3 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path
|
|||||||
assert payload["response"]["data"] == "file1.txt\nfile2.txt"
|
assert payload["response"]["data"] == "file1.txt\nfile2.txt"
|
||||||
assert payload["return"] == "Task completed successfully"
|
assert payload["return"] == "Task completed successfully"
|
||||||
assert payload["data"] == "file1.txt\nfile2.txt"
|
assert payload["data"] == "file1.txt\nfile2.txt"
|
||||||
assert payload["verification"] == {"ok": True, "path": "C:/tmp/final.png"}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user