diff --git a/.gitignore b/.gitignore index d2c46fa..9d600ed 100644 --- a/.gitignore +++ b/.gitignore @@ -15,8 +15,8 @@ env/ # Runtime artifacts screenjob_runs/ result.json +screenjob.db # IDE .vscode/ .idea/ - diff --git a/README.md b/README.md index b98ea69..0016d65 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,123 @@ # ScreenJob -Single-file behavior, split into maintainable modules under `src/`. +Desktop-and-terminal task agent with: -## Entry point - -- Primary: `python main.py ""` -- Backward compatible: `python screenjob.py ""` +- CLI runner +- FastAPI job server +- SQLite task history +- WebSocket-powered monitoring UI +- Safety pre-check and per-job tool disable controls +- Live/final token and cost estimation ## Install ```powershell -pip install openai pillow pyautogui python-dotenv +pip install openai pillow pyautogui python-dotenv fastapi uvicorn ``` -## Configure +## Environment -Create a `.env` file in project root: +Create `.env` in project root: ```env -OPENAI_API_KEY=your_key_here +OPENAI_API_KEY=... 
+SCREENJOB_TOKEN=choose_a_strong_token + +# Optional +SCREENJOB_DEFAULT_MODEL=gpt-5.4-mini +SCREENJOB_SAFETY_MODEL=gpt-5.4-mini +SCREENJOB_HOST=127.0.0.1 +SCREENJOB_PORT=8787 +DISABLE_UI=false ``` -## Usage +## Entry Points + +- `python main.py run "<job>"` +- `python main.py server` +- Backward-compatible wrapper: `python screenjob.py "<job>"` + +## CLI Usage ```powershell -python main.py "Open amazon.de and go to my orders" +python main.py run "Open amazon.de and go to my orders" ``` -Optional flags: +Useful flags: -```powershell -python main.py "Open amazon.de" --model gpt-5.2 --max-steps 80 +- `--model gpt-5.4-mini` +- `--disable-tool click --disable-tool type` +- `--skip-safety-check` +- `--max-steps 80` + +## HTTP API + +All API routes require token auth using `SCREENJOB_TOKEN`: + +- `Authorization: Bearer <token>` or +- `X-ScreenJob-Token: <token>` +- (for browser/image fetch) `?token=<token>` query parameter + +### Create Job + +`POST /api/jobs` + +Body: + +```json +{ + "job": "Open amazon.de and go to my orders", + "model": "gpt-5.4-mini", + "disabled_tools": ["click"], + "safety_override": false +} ``` -## Tools exposed to the model +Response: + +```json +{ "job_id": "job_..." 
} +``` + +### Status / Output + +- `GET /api/jobs/{job_id}`: full status + output + live/final usage/cost +- `GET /api/jobs/{job_id}/status`: status alias +- `GET /api/jobs/{job_id}/events`: detailed timeline +- `GET /api/jobs/{job_id}/artifact?path=<path>&token=<token>`: authenticated artifact file fetch for screenshots/enhancements +- `GET /api/jobs`: list active + past jobs +- `POST /api/jobs/{job_id}/cancel`: graceful cancellation +- `GET /api/stats`: aggregate metrics + +## Monitoring UI + +- Served at `/` when `DISABLE_UI=false` +- Tailwind-based read-only dashboard +- Requires entering `SCREENJOB_TOKEN` in UI before data loads +- Uses WebSocket `/ws` for live updates (tool calls, step events, usage/cost updates) +- No task launch controls in UI (monitoring only) + +If `DISABLE_UI=true`, `/` returns `{ "ui_disabled": true }` and only API endpoints remain. + +## Safety + +Before execution, each task is classified by a model safety gate: + +- Safe: task runs +- Unsafe: task is rejected and recorded +- Override: set `safety_override=true` (or `--skip-safety-check` in CLI) + +## Tool Controls + +Per-job tool restriction via a disable list (denylist): + +- API: `disabled_tools: ["type", "click"]` +- CLI: `--disable-tool type --disable-tool click` + +Available tools: - `execute_command(command)` -- `sleep(seconds)` (replaces shell-based sleep calls) +- `sleep(seconds)` - `see_screen()` - `enhance(coordinate)` - `click(coordinate, offset_up/down/left/right, sleep_after_seconds)` @@ -44,51 +125,36 @@ python main.py "Open amazon.de" --model gpt-5.2 --max-steps 80 - `press_key(key, repeats=1)` - `task_complete(result)` -### Offset examples +## Cost Estimation -- `{"coordinate":{"x":1000,"y":500},"offset_up":"2px"}` -- `{"coordinate":{"x":1000,"y":500},"offset_right":4}` +Live/final cost is computed from OpenAI response usage (`input`, `cached_input`, `output`) and model pricing rates in `src/pricing.py`. 
-### Multi-tool calls in one step +- Live: exposed in `GET /api/jobs/{job_id}` during execution +- Final: persisted in SQLite and returned in status output -The agent supports multiple tool calls in a single model response and executes them in order. -Example sequence in one step: +## Persistence -1. `click(...)` -2. `sleep({"seconds": 1.5})` +- SQLite DB: `screenjob.db` +- Runs/artifacts: `screenjob_runs/run_YYYYMMDD_HHMMSS/...` +- Full event log per job (for history and UI) -You can also use `click(..., sleep_after_seconds=1.5)` for a one-call variant. - -## Output - -Each run creates: - -- `screenjob_runs/run_YYYYMMDD_HHMMSS/logs/screenjob.log` -- `screenjob_runs/run_YYYYMMDD_HHMMSS/screens/*.png` -- `screenjob_runs/run_YYYYMMDD_HHMMSS/enhanced/*.png` - -Final stdout is JSON: - -```json -{ - "completed": true, - "result": "...", - "steps": 13, - "elapsed_seconds": 59.691, - "artifacts_dir": "C:\\...\\screenjob_runs\\run_..." -} -``` - -## Project layout +## Project Layout ```text main.py screenjob.py src/ __init__.py - cli.py agent.py + app_main.py + cli.py + config.py models.py - utils.py + pricing.py + runtime.py + safety.py + server.py + storage.py + task_manager.py + ui.py ``` - diff --git a/src/__init__.py b/src/__init__.py index 6e6874d..4987b02 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1 +1 @@ -# Root package marker for local imports like: from src.cli import main +# Root package marker for local imports. 
diff --git a/src/server.py b/src/server.py index 3b796cd..588079a 100644 --- a/src/server.py +++ b/src/server.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Any from fastapi import Depends, FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect +from fastapi.responses import FileResponse from fastapi.responses import HTMLResponse, JSONResponse from pydantic import BaseModel, Field @@ -86,7 +87,13 @@ def create_app(config: AppConfig | None = None) -> FastAPI: async def _on_startup() -> None: ws_hub.set_loop(asyncio.get_running_loop()) - def _extract_token(authorization: str | None, x_screenjob_token: str | None) -> str: + def _extract_token( + authorization: str | None, + x_screenjob_token: str | None, + query_token: str | None, + ) -> str: + if query_token: + return query_token.strip() if x_screenjob_token: return x_screenjob_token.strip() if authorization: @@ -99,9 +106,10 @@ def create_app(config: AppConfig | None = None) -> FastAPI: def require_token( authorization: str | None = Header(default=None), x_screenjob_token: str | None = Header(default=None), + token: str | None = Query(default=None), ) -> None: - token = _extract_token(authorization, x_screenjob_token) - if not token or not secrets.compare_digest(token, app_config.screenjob_token): + resolved = _extract_token(authorization, x_screenjob_token, token) + if not resolved or not secrets.compare_digest(resolved, app_config.screenjob_token): raise HTTPException(status_code=401, detail="Unauthorized") @app.post("/api/jobs") @@ -130,6 +138,13 @@ def create_app(config: AppConfig | None = None) -> FastAPI: raise HTTPException(status_code=404, detail="Job not found") return job + @app.get("/api/jobs/{job_id}/status") + def get_job_status(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]: + job = manager.get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Job not found") + return job + @app.get("/api/jobs/{job_id}/events") def 
get_job_events( job_id: str, @@ -149,6 +164,28 @@ def create_app(config: AppConfig | None = None) -> FastAPI: accepted = manager.cancel_job(job_id) return {"job_id": job_id, "cancel_requested": bool(accepted)} + @app.get("/api/jobs/{job_id}/artifact") + def get_job_artifact( + job_id: str, + path: str = Query(..., min_length=1), + _: None = Depends(require_token), + ) -> FileResponse: + job = manager.get_job(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Job not found") + artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip() + if not artifacts_dir_raw: + raise HTTPException(status_code=404, detail="Artifacts not available yet") + artifacts_dir = Path(artifacts_dir_raw).resolve() + requested = Path(path).resolve() + try: + requested.relative_to(artifacts_dir) + except ValueError as exc: + raise HTTPException(status_code=400, detail="Artifact path is outside job artifacts directory") from exc + if not requested.exists() or not requested.is_file(): + raise HTTPException(status_code=404, detail="Artifact not found") + return FileResponse(str(requested)) + @app.get("/api/stats") def stats(_: None = Depends(require_token)) -> dict[str, Any]: return manager.stats() diff --git a/src/task_manager.py b/src/task_manager.py index 0db25ef..da97947 100644 --- a/src/task_manager.py +++ b/src/task_manager.py @@ -191,6 +191,13 @@ class JobManager: def on_event(event: dict[str, Any]) -> None: self._publish(job_id, event) + if event.get("event_type") == "job_started": + run_id = str(((event.get("payload") or {}).get("run_id") or "")).strip() + if run_id: + self.db.update_job( + job_id, + artifacts_dir=str((self.config.runs_dir / f"run_{run_id}").resolve()), + ) if event.get("event_type") == "usage_update": usage = (event.get("payload") or {}).get("usage") or {} self.db.update_job( diff --git a/src/ui.py b/src/ui.py index 96a11c6..1da1ded 100644 --- a/src/ui.py +++ b/src/ui.py @@ -37,6 +37,10 @@ def monitoring_page_html() -> str:

Job Detail


+        

Latest Visual

+
+ Latest visual update +

Live Events

@@ -51,6 +55,7 @@ def monitoring_page_html() -> str: const jobDetailEl = document.getElementById("jobDetail"); const eventsEl = document.getElementById("events"); const statsEl = document.getElementById("stats"); + const latestVisualEl = document.getElementById("latestVisual"); const state = { token: localStorage.getItem("screenjob_token") || "", @@ -123,6 +128,15 @@ def monitoring_page_html() -> str: } } + function updateLatestVisualFromEvent(ev) { + if (!ev || ev.event_type !== "visual_update") return; + if (!state.selectedJobId || ev.job_id !== state.selectedJobId) return; + const imagePath = ev.payload && ev.payload.image_meta && ev.payload.image_meta.path; + if (!imagePath) return; + const q = encodeURIComponent(imagePath); + latestVisualEl.src = `/api/jobs/${state.selectedJobId}/artifact?path=${q}&token=${encodeURIComponent(state.token)}`; + } + async function refreshJobs() { const payload = await api("/api/jobs?limit=100"); state.jobs = payload.jobs || []; @@ -143,7 +157,10 @@ def monitoring_page_html() -> str: ]); jobDetailEl.textContent = JSON.stringify(job, null, 2); eventsEl.innerHTML = ""; - for (const ev of (events.events || []).slice().reverse()) pushEventLine(ev); + const list = (events.events || []).slice().reverse(); + for (const ev of list) pushEventLine(ev); + const visual = list.find((ev) => ev.event_type === "visual_update"); + if (visual) updateLatestVisualFromEvent(visual); } function connectWs() { @@ -158,6 +175,7 @@ def monitoring_page_html() -> str: try { const payload = JSON.parse(event.data); pushEventLine(payload); + updateLatestVisualFromEvent(payload); if (!state.selectedJobId || payload.job_id === state.selectedJobId) { await refreshJobDetail(); } @@ -190,4 +208,3 @@ def monitoring_page_html() -> str: """ -