From c09f0ee9c0969e0b7550ce22d38e4364d03ee6de Mon Sep 17 00:00:00 2001 From: Space-Banane Date: Wed, 27 May 2026 18:08:52 +0200 Subject: [PATCH] feat: finalize production cleanup with structured agent responses and project governance --- .gitea/workflows/ci.yml | 2 +- CODE_OF_CONDUCT.md | 32 ++++++ CONTRIBUTING.md | 58 ++++++++++ LICENSE | 201 ++++++++++++++++++++++++++++++++ README.md | 236 ++++++++++++++++++++------------------ SKILL.md | 33 ++++++ docker-compose.yml | 17 +++ requirements.txt | 8 ++ src/agent.py | 34 ++++-- src/cli.py | 9 +- src/models.py | 2 + src/storage.py | 19 +++ src/task_manager.py | 28 ++++- tests/test_agent_tools.py | 86 ++++++++++++++ tests/test_cli.py | 68 +++++++++++ tests/test_server_api.py | 9 +- tests/test_storage.py | 21 ++++ 17 files changed, 737 insertions(+), 126 deletions(-) create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 SKILL.md create mode 100644 docker-compose.yml create mode 100644 requirements.txt create mode 100644 tests/test_agent_tools.py create mode 100644 tests/test_cli.py diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 5b8eca8..7076d2c 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -21,7 +21,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install openai pillow python-dotenv fastapi uvicorn pytest httpx + pip install -r requirements.txt - name: Compile check run: | diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..4f25ee5 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,32 @@ +# Code of Conduct + +## Our Pledge + +We want ScreenJob to be an open, respectful, and harassment-free project for everyone. + +## Expected Behavior + +- Be respectful and constructive. +- Assume good intent, ask clarifying questions, and focus on technical outcomes. +- Accept feedback professionally. +- Share credit and document decisions clearly. 
+ +## Unacceptable Behavior + +- Harassment, threats, or intimidation. +- Personal attacks, insults, or discriminatory language. +- Publishing private information without permission. +- Deliberate disruption of project collaboration. + +## Enforcement + +Project maintainers may remove or reject contributions and interactions that violate this Code of Conduct. + +## Reporting + +Report violations privately, either by opening a confidential issue visible only to maintainers or by contacting the project maintainers directly. + +## Attribution + +This policy is adapted from the Contributor Covenant, version 2.1: +https://www.contributor-covenant.org/version/2/1/code_of_conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..7f4fbfe --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,58 @@ +# Contributing to ScreenJob + +## Development Setup + +1. Use Python 3.11+. +2. Install dependencies: + +```powershell +pip install -r requirements.txt +``` + +3. Create `.env` with required keys: + - `OPENAI_API_KEY` + - `SCREENJOB_TOKEN` + +## Branch and PR Workflow + +1. Create a feature branch from `main`. +2. Keep changes focused and modular. +3. Add or update tests for behavior changes. +4. Run checks locally before opening a PR: + +```powershell +pytest -q +``` + +5. Open a PR with: + - problem statement + - approach summary + - test evidence + +## Code Guidelines + +- Favor small, single-purpose functions. +- Keep runtime and API behavior deterministic where possible. +- Preserve backward compatibility for external API contracts unless explicitly changing versioned behavior. +- Use clear error messages and structured outputs. +- Do not commit secrets, API keys, or runtime artifacts. + +## Testing Expectations + +- Unit tests for new logic paths. +- API tests for request/response contract changes. +- Regression coverage for bug fixes. + +## Security and Safety + +- Treat safety checks as part of core behavior. +- Do not bypass auth/token requirements in server code. 
+- Restrict file access to intended directories (for artifacts/endpoints). + +## Reporting Issues + +Use issues for: + +- reproducible bugs +- security concerns (with minimal sensitive detail) +- feature requests with concrete use cases diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..8d968b6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index cee3f59..e373a3e 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,66 @@ # ScreenJob -Desktop-and-terminal task agent with: +ScreenJob is an autonomous desktop-and-terminal execution service. +It lets an LLM use controlled local tools (screen, click, type, shell) to complete GUI-heavy tasks on a real computer. -- CLI runner -- FastAPI job server -- SQLite task history -- WebSocket-powered monitoring UI -- Safety pre-check and per-job tool disable controls -- Live/final token and cost estimation +## What It Solves -## Install +- Runs agent-driven tasks that require a graphical interface. +- Exposes both CLI and HTTP API modes. +- Stores job history and events in SQLite. +- Streams live monitoring updates over WebSocket. 
+- Returns structured agent output as: + - `return`: human-readable completion message + - `data`: structured payload (for example command output) -```powershell -pip install openai pillow pyautogui python-dotenv fastapi uvicorn +## Core Features + +- Tool-based agent loop (`execute_command`, `see_screen`, `enhance`, `click`, `type`, `press_key`, `sleep`, `task_complete`) +- Safety pre-check with override support +- Per-job tool disable list +- Live/final usage and cost estimates +- Read-only Tailwind monitoring UI +- Persistent job and event history + +## Project Layout + +```text +main.py +screenjob.py +requirements.txt +docker-compose.yml +src/ + agent.py + app_main.py + cli.py + config.py + models.py + pricing.py + runtime.py + safety.py + server.py + storage.py + task_manager.py + ui.py + utils.py +tests/ + test_agent_tools.py + test_pricing.py + test_server_api.py + test_storage.py +.gitea/workflows/ci.yml ``` -## Environment +## Setup -Create `.env` in project root: +1. Install Python 3.11+. +2. Install dependencies: + +```powershell +pip install -r requirements.txt +``` + +3. Create `.env` in project root: ```env OPENAI_API_KEY=... 
@@ -31,44 +74,50 @@ SCREENJOB_PORT=8787 DISABLE_UI=false ``` -## Entry Points +## Usage -- `python main.py run ""` -- `python main.py server` -- Backward-compatible wrapper: `python screenjob.py ""` - -## CLI Usage +### CLI ```powershell python main.py run "Open amazon.de and go to my orders" ``` -Useful flags: +CLI JSON output includes both legacy and structured fields: -- `--model gpt-5.4-mini` -- `--disable-tool click --disable-tool type` -- `--skip-safety-check` -- `--max-steps 80` +```json +{ + "completed": true, + "result": "Task completed successfully", + "response": { + "return": "Task completed successfully", + "data": "file1.txt\nfile2.txt" + }, + "return": "Task completed successfully", + "data": "file1.txt\nfile2.txt" +} +``` -## HTTP API +### Server -All API routes require token auth using `SCREENJOB_TOKEN`: +```powershell +python main.py server +``` -- `Authorization: Bearer ` or -- `X-ScreenJob-Token: ` -- (for browser/image fetch) `?token=` query parameter +Auth for all API routes: + +- `Authorization: Bearer <token>` +- `X-ScreenJob-Token: <token>` +- Query fallback `?token=<token>` (mainly for UI/websocket/artifact fetch) ### Create Job `POST /api/jobs` -Body: - ```json { - "job": "Open amazon.de and go to my orders", + "job": "run \"ls -a\" in C:/Users/username/Documents and return output", "model": "gpt-5.4-mini", - "disabled_tools": ["click"], + "disabled_tools": [], "safety_override": false } ``` @@ -79,103 +128,68 @@ Response: { "job_id": "job_..." 
} ``` -### Status / Output +### Job Status / History -- `GET /api/jobs/{job_id}`: full status + output + live/final usage/cost -- `GET /api/jobs/{job_id}/status`: status alias -- `GET /api/jobs/{job_id}/events`: detailed timeline -- `GET /api/jobs/{job_id}/artifact?path=&token=`: authenticated artifact file fetch for screenshots/enhancements -- `GET /api/jobs`: list active + past jobs -- `POST /api/jobs/{job_id}/cancel`: graceful cancellation -- `GET /api/stats`: aggregate metrics +- `GET /api/jobs/{job_id}` +- `GET /api/jobs/{job_id}/status` +- `GET /api/jobs/{job_id}/events` +- `GET /api/jobs` +- `POST /api/jobs/{job_id}/cancel` +- `GET /api/stats` -## Monitoring UI +Each job payload includes: -- Served at `/` when `DISABLE_UI=false` -- Tailwind-based read-only dashboard -- Requires entering `SCREENJOB_TOKEN` in UI before data loads -- Uses WebSocket `/ws` for live updates (tool calls, step events, usage/cost updates) -- No task launch controls in UI (monitoring only) +- `result` (compat string) +- `response.return` +- `response.data` +- top-level `return` and `data` aliases -If `DISABLE_UI=true`, `/` returns `{ "ui_disabled": true }` and only API endpoints remain. +### Monitoring UI -## Safety +- URL: `/` +- Read-only dashboard (no run controls) +- Requires token input +- Live updates via `/ws` +- Set `DISABLE_UI=true` to disable UI -Before execution, each task is classified by a model safety gate: +## Agent Instructions (Practical) -- Safe: task runs -- Unsafe: task is rejected and recorded -- Override: set `safety_override=true` (or `--skip-safety-check` in CLI) +- Prefer `execute_command` for deterministic actions (opening URLs, filesystem checks). +- Use `see_screen` before UI interaction. +- Use `enhance` when text is unclear. +- Use `press_key` for non-text keys (Enter, Tab, arrows, Escape). +- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. 
+- When done, call: + - `task_complete(return="...", data=...)` -## Tool Controls +`data` should contain useful structured output for the requester (text, object, list, etc.). -Per-job tool allowlisting via disable list: +## Docker Compose -- API: `disabled_tools: ["type", "click"]` -- CLI: `--disable-tool type --disable-tool click` +Run server in container: -Available tools: - -- `execute_command(command)` -- `sleep(seconds)` -- `see_screen()` -- `enhance(coordinate)` -- `click(coordinate, offset_up/down/left/right, sleep_after_seconds)` -- `type(text)` -- `press_key(key, repeats=1)` -- `task_complete(result)` - -## Cost Estimation - -Live/final cost is computed from OpenAI response usage (`input`, `cached_input`, `output`) and model pricing rates in `src/pricing.py`. - -- Live: exposed in `GET /api/jobs/{job_id}` during execution -- Final: persisted in SQLite and returned in status output - -## Persistence - -- SQLite DB: `screenjob.db` -- Runs/artifacts: `screenjob_runs/run_YYYYMMDD_HHMMSS/...` -- Full event log per job (for history and UI) - -## Project Layout - -```text -main.py -screenjob.py -src/ - __init__.py - agent.py - app_main.py - cli.py - config.py - models.py - pricing.py - runtime.py - safety.py - server.py - storage.py - task_manager.py - ui.py -tests/ - conftest.py - test_pricing.py - test_server_api.py - test_storage.py -.gitea/ - workflows/ - ci.yml +```powershell +docker compose up --build ``` +The service uses the official Python image and reads `.env`. + ## Verification -Run local verification: +Local: ```powershell pytest -q ``` -Gitea CI pipeline: +CI: -- File: `.gitea/workflows/ci.yml` -- Runs compile checks + pytest on push and PR. +- `.gitea/workflows/ci.yml` runs compile checks + tests on push/PR. + +## Compatibility Entry Point + +- `python screenjob.py "<job>"` remains supported as a wrapper to `main.py`. + +## License + +Apache License 2.0. See `LICENSE`. 
diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..37eea4e --- /dev/null +++ b/SKILL.md @@ -0,0 +1,33 @@ +# ScreenJob Skill (OpenClaw Agents) + +## What ScreenJob Solves + +ScreenJob lets an agent execute tasks that require a real desktop UI plus terminal access, with structured tool calls and job tracking. + +## Main Features + +- Screen perception (`see_screen`, `enhance`) +- Mouse/keyboard control (`click`, `type`, `press_key`) +- Terminal execution (`execute_command`, `sleep`) +- Structured completion payload (`task_complete(return=..., data=...)`) +- Safety gate, auth, history, and live monitoring + +## Important Environment Note + +ScreenJob runs on a separate computer (the human/operator machine), not inside the agent's own runtime environment. + +## Why It Is Useful + +Agents can use ScreenJob to launch and control GUI workflows, including orchestrating other GUI agents/tools on a human computer. + +## Example Tasks + +- Open amazon.de and buy a USB-C to USB-C cable for 10 EUR or less. +- Open google.com, go to my account, and change my profile picture to a provided image URL. +- Run `ls -a` in `C:/Users/username/Documents` and return the output in `data`. + +## Practical Usage + +1. Submit job via CLI or API. +2. Agent performs tool loop. +3. Read final `response.return` and `response.data` from job status. 
diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..68d694d --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,17 @@ +services: + screenjob: + image: python:3.11-slim + working_dir: /app + env_file: + - .env + environment: + SCREENJOB_HOST: 0.0.0.0 + SCREENJOB_PORT: 8787 + volumes: + - ./:/app + ports: + - "8787:8787" + command: > + sh -c "pip install --no-cache-dir -r requirements.txt && + python main.py server" + restart: unless-stopped diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a622889 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +fastapi>=0.115,<1.0 +httpx>=0.27,<1.0 +openai>=1.0.0 +pillow>=10.0.0 +pyautogui>=0.9.54 +pytest>=8.0.0 +python-dotenv>=1.0.0 +uvicorn>=0.30.0 diff --git a/src/agent.py b/src/agent.py index da1739c..a8b91f0 100644 --- a/src/agent.py +++ b/src/agent.py @@ -39,7 +39,8 @@ Rules: 7) You may call multiple tools in one step. If needed, do click then sleep. 8) Never spam repeated clicks on the same coordinate; switch strategy. 9) Keep tool arguments valid JSON and concise. -10) When objective is fully complete, call task_complete(result="..."). +10) When objective is fully complete, call task_complete(return="...", data=...). +11) The "data" field should contain structured output useful for the requester (for example command output text). 
""" @@ -69,6 +70,7 @@ class ScreenJobAgent: self.step = 0 self.completed = False self.final_result = "" + self.final_data: Any | None = None self.previous_response_id: str | None = None self.usage = UsageSummary() @@ -134,9 +136,11 @@ class ScreenJobAgent: "parameters": { "type": "object", "properties": { + "return": {"type": "string"}, "result": {"type": "string"}, + "data": {}, }, - "required": ["result"], + "required": [], "additionalProperties": False, }, }, @@ -551,10 +555,17 @@ class ScreenJobAgent: return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"} def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]: - result = str(args.get("result", "")).strip() or "Task completed." + return_text = str(args.get("return", "")).strip() + if not return_text: + return_text = str(args.get("result", "")).strip() + if not return_text: + return_text = "Task completed." + + data = args.get("data") self.completed = True - self.final_result = result - return {"ok": True, "result": result} + self.final_result = return_text + self.final_data = data + return {"ok": True, "return": return_text, "data": data} def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]: if name in self.disabled_tools: @@ -620,7 +631,8 @@ class ScreenJobAgent: f"JOB: {job}\n" "You are in an action loop. Prefer execute_command for deterministic actions. " "You can return multiple tool calls in one step (example: click then sleep). " - "Call task_complete(result=...) only when truly done." + "When done call task_complete(return=..., data=...). " + "Include useful structured output in data." ), } ], @@ -672,7 +684,7 @@ class ScreenJobAgent: "text": ( "No function call was returned. Continue by using tools. " "You may call multiple tools in one step. " - "When complete, call task_complete(result=...)." + "When complete, call task_complete(return=..., data=...)." 
), } ], @@ -746,6 +758,8 @@ class ScreenJobAgent: return AgentResult( completed=True, result=self.final_result, + return_message=self.final_result, + data=self.final_data, steps=self.step, started_at=started_at, ended_at=ended_at, @@ -758,6 +772,8 @@ class ScreenJobAgent: return AgentResult( completed=False, result="Cancelled by user request.", + return_message="Cancelled by user request.", + data=None, steps=self.step, started_at=started_at, ended_at=ended_at, @@ -772,6 +788,8 @@ class ScreenJobAgent: return AgentResult( completed=False, result=error_text, + return_message=error_text, + data=None, steps=self.step, started_at=started_at, ended_at=ended_at, @@ -785,6 +803,8 @@ class ScreenJobAgent: return AgentResult( completed=False, result=result_text, + return_message=result_text, + data=None, steps=self.step, started_at=started_at, ended_at=ended_at, diff --git a/src/cli.py b/src/cli.py index c0cbe85..bf379ea 100644 --- a/src/cli.py +++ b/src/cli.py @@ -61,6 +61,9 @@ def main(argv: list[str] | None = None) -> int: { "completed": False, "result": f"Blocked by safety check: {reason}", + "response": {"return": f"Blocked by safety check: {reason}", "data": parsed}, + "return": f"Blocked by safety check: {reason}", + "data": parsed, "safety": parsed, }, ensure_ascii=False, @@ -101,7 +104,10 @@ def main(argv: list[str] | None = None) -> int: payload = { "completed": result.completed, - "result": result.result, + "result": result.return_message, + "response": {"return": result.return_message, "data": result.data}, + "return": result.return_message, + "data": result.data, "steps": result.steps, "elapsed_seconds": round(result.ended_at - result.started_at, 3), "artifacts_dir": str(artifacts.root_dir.resolve()), @@ -111,4 +117,3 @@ def main(argv: list[str] | None = None) -> int: } print(json.dumps(payload, ensure_ascii=False, indent=2)) return 0 if result.completed else 1 - diff --git a/src/models.py b/src/models.py index 6528cbe..7a55fca 100644 --- a/src/models.py +++ 
b/src/models.py @@ -19,6 +19,8 @@ class RunArtifacts: class AgentResult: completed: bool result: str + return_message: str + data: Any | None steps: int started_at: float ended_at: float diff --git a/src/storage.py b/src/storage.py index c4ab525..3abd7eb 100644 --- a/src/storage.py +++ b/src/storage.py @@ -31,6 +31,7 @@ class HistoryDB: started_at TEXT, ended_at TEXT, result TEXT, + response_json TEXT, error TEXT, steps INTEGER DEFAULT 0, cancelled INTEGER DEFAULT 0, @@ -65,6 +66,9 @@ class HistoryDB: conn.execute( "CREATE INDEX IF NOT EXISTS idx_job_events_job_id_id ON job_events(job_id, id)" ) + columns = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()} + if "response_json" not in columns: + conn.execute("ALTER TABLE jobs ADD COLUMN response_json TEXT") conn.commit() def create_job( @@ -195,6 +199,7 @@ class HistoryDB: "started_at": row["started_at"], "ended_at": row["ended_at"], "result": row["result"], + "response": self._parse_response_payload(row["response_json"], row["result"]), "error": row["error"], "steps": row["steps"], "cancelled": bool(row["cancelled"]), @@ -214,3 +219,17 @@ class HistoryDB: }, } + def _parse_response_payload(self, response_json: str | None, result: str | None) -> dict[str, Any]: + fallback_return = str(result or "").strip() + if not response_json: + return {"return": fallback_return, "data": None} + try: + payload = json.loads(response_json) + if isinstance(payload, dict): + return { + "return": str(payload.get("return") or fallback_return), + "data": payload.get("data"), + } + except Exception: + pass + return {"return": fallback_return, "data": None} diff --git a/src/task_manager.py b/src/task_manager.py index da97947..3d8cc42 100644 --- a/src/task_manager.py +++ b/src/task_manager.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import threading import time import uuid @@ -159,6 +160,7 @@ class JobManager: ended_at=ended_at, error=error_text, result=error_text, + 
response_json=json.dumps({"return": error_text, "data": None}, ensure_ascii=False), ) self._publish( job_id, @@ -237,6 +239,7 @@ class JobManager: ended_at=ended_at, error=err, result=err, + response_json=json.dumps({"return": err, "data": None}, ensure_ascii=False), ) self._publish(job_id, {"ts": ended_at, "step": 0, "event_type": "job_failed", "payload": {"error": err}}) with self._lock: @@ -251,7 +254,14 @@ class JobManager: job_id, status=status, ended_at=ended_at, - result=result.result, + result=result.return_message, + response_json=json.dumps( + { + "return": result.return_message, + "data": result.data, + }, + ensure_ascii=False, + ), error=result.error, steps=result.steps, cancelled=1 if result.cancelled else 0, @@ -271,7 +281,8 @@ class JobManager: "event_type": "job_finished", "payload": { "status": status, - "result": result.result, + "result": result.return_message, + "response": {"return": result.return_message, "data": result.data}, "error": result.error, "cancelled": result.cancelled, "usage": result.usage.to_dict(), @@ -318,10 +329,10 @@ class JobManager: job["is_running_thread"] = live.thread.is_alive() else: job["is_running_thread"] = False - return job + return self._normalize_job_payload(job) def list_jobs(self, limit: int = 100) -> list[dict[str, Any]]: - return self.db.list_jobs(limit=limit) + return [self._normalize_job_payload(job) for job in self.db.list_jobs(limit=limit)] def get_events(self, job_id: str, limit: int = 500) -> list[dict[str, Any]]: return self.db.get_job_events(job_id, limit=limit) @@ -331,3 +342,12 @@ class JobManager: with self._lock: stats["live_running_threads"] = sum(1 for job in self._running.values() if job.thread.is_alive()) return stats + + def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]: + response = job.get("response") + if not isinstance(response, dict): + response = {"return": str(job.get("result") or ""), "data": None} + job["response"] = response + job["return"] = 
str(response.get("return") or "") + job["data"] = response.get("data") + return job diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py new file mode 100644 index 0000000..22b4932 --- /dev/null +++ b/tests/test_agent_tools.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import logging +from pathlib import Path + +from PIL import Image + +import src.agent as agent_module +from src.models import RunArtifacts, RuntimeOptions + + +class _DummyPyAutoGUI: + FAILSAFE = True + PAUSE = 0.0 + + def __init__(self) -> None: + self.last_move_to: tuple[int, int] | None = None + self.last_click: tuple[int, int] | None = None + + def screenshot(self) -> Image.Image: + return Image.new("RGB", (1280, 720), color=(24, 24, 24)) + + def size(self) -> tuple[int, int]: + return (1280, 720) + + def moveTo(self, x: int, y: int, duration: float = 0.0) -> None: # noqa: N802 + self.last_move_to = (x, y) + + def click(self, x: int, y: int) -> None: + self.last_click = (x, y) + + def write(self, _: str, interval: float = 0.0) -> None: + return None + + def press(self, _: str) -> None: + return None + + +def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent: + dummy_gui = _DummyPyAutoGUI() + monkeypatch.setattr(agent_module, "pyautogui", dummy_gui) + monkeypatch.setattr(agent_module.time, "sleep", lambda _: None) + + run_dir = tmp_path / "run" + run_dir.mkdir(parents=True, exist_ok=True) + artifacts = RunArtifacts( + run_id="test_run", + root_dir=run_dir, + logs_dir=run_dir / "logs", + shots_dir=run_dir / "shots", + enhance_dir=run_dir / "enhance", + log_file=run_dir / "screenjob.log", + ) + options = RuntimeOptions(model="gpt-5.4-mini") + logger = logging.getLogger("screenjob-test-agent") + return agent_module.ScreenJobAgent( + client=object(), # type: ignore[arg-type] + logger=logger, + artifacts=artifacts, + options=options, + ) + + +def test_task_complete_captures_return_and_data(tmp_path: Path, monkeypatch) -> None: + agent = 
def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path: Path) -> None:
    """The CLI exits 0 and prints JSON exposing the return message and data, both nested and flat."""
    expected_return = "Task completed successfully"
    expected_data = "file1.txt\nfile2.txt"

    config = AppConfig(
        openai_api_key="test_key",
        screenjob_token="test_token",
        disable_ui=False,
        default_model="gpt-5.4-mini",
        safety_model="gpt-5.4-mini",
        host="127.0.0.1",
        port=8787,
        runs_dir=tmp_path / "runs",
        db_path=tmp_path / "screenjob.db",
    )
    config.runs_dir.mkdir(parents=True, exist_ok=True)

    def fake_run_job(*_args, **_kwargs):
        # Canned agent outcome; no real agent run happens in this test.
        agent_result = AgentResult(
            completed=True,
            result="Done",
            return_message=expected_return,
            data=expected_data,
            steps=3,
            started_at=10.0,
            ended_at=12.5,
            usage=UsageSummary(total_tokens=123),
            error=None,
            cancelled=False,
        )
        run_root = config.runs_dir / "run_20260527_000001"
        run_artifacts = RunArtifacts(
            run_id="20260527_000001",
            root_dir=run_root,
            logs_dir=run_root / "logs",
            shots_dir=run_root / "shots",
            enhance_dir=run_root / "enhance",
            log_file=run_root / "screenjob.log",
        )
        return agent_result, run_artifacts

    # Replace every collaborator that `main` looks up on the cli module.
    monkeypatch.setattr(cli_module, "load_app_config", lambda _: config)
    monkeypatch.setattr(
        cli_module,
        "assess_task_safety",
        lambda *_args, **_kwargs: (True, "safe", {"safe": True}),
    )
    monkeypatch.setattr(cli_module, "run_job", fake_run_job)
    monkeypatch.setattr(cli_module, "create_openai_client", lambda *_args, **_kwargs: object())

    exit_code = cli_module.main(["Open amazon.de"])
    assert exit_code == 0

    emitted = json.loads(capsys.readouterr().out)
    nested = emitted["response"]
    assert nested["return"] == expected_return
    assert nested["data"] == expected_data
    # The same two values are mirrored at the top level of the payload.
    assert emitted["return"] == expected_return
    assert emitted["data"] == expected_data
def test_storage_response_fallback_uses_result_when_json_missing(tmp_path: Path) -> None:
    """A job updated without ``response_json`` reports its ``result`` text as the response's return message."""
    legacy_text = "Legacy result string"
    job_id = "job_test_002"
    db = HistoryDB(tmp_path / "screenjob_test_fallback.db")

    db.create_job(
        job_id=job_id,
        objective="Fallback check",
        model="gpt-5.4-mini",
        created_at="2026-05-27T00:00:00Z",
        safety_override=False,
        disabled_tools=[],
    )
    # Deliberately no response_json here: only the plain result is stored.
    db.update_job(job_id, status="completed", result=legacy_text)

    stored = db.get_job(job_id)
    assert stored is not None
    response = stored["response"]
    assert response["return"] == legacy_text
    assert response["data"] is None