feat: add authenticated artifact streaming and UI visual previews

2026-05-27 17:50:21 +02:00
parent 10355bf11a
commit 8fe6ad2d75
6 changed files with 184 additions and 57 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,8 +15,8 @@ env/
 # Runtime artifacts
 screenjob_runs/
 result.json
+screenjob.db

 # IDE
 .vscode/
 .idea/
-
--- a/README.md
+++ b/README.md
@@ -1,42 +1,123 @@
 # ScreenJob

-Single-file behavior, split into maintainable modules under `src/`.
+Desktop-and-terminal task agent with:

-## Entry point
-
- Primary: `python main.py "<task>"`
- Backward compatible: `python screenjob.py "<task>"`
+- CLI runner
+- FastAPI job server
+- SQLite task history
+- WebSocket-powered monitoring UI
+- Safety pre-check and per-job tool disable controls
+- Live/final token and cost estimation

 ## Install

 ```powershell
-pip install openai pillow pyautogui python-dotenv
+pip install openai pillow pyautogui python-dotenv fastapi uvicorn
 ```

-## Configure
+## Environment

-Create a `.env` file in project root:
+Create `.env` in project root:

 ```env
-OPENAI_API_KEY=your_key_here
+OPENAI_API_KEY=...
+SCREENJOB_TOKEN=choose_a_strong_token
+
+# Optional
+SCREENJOB_DEFAULT_MODEL=gpt-5.4-mini
+SCREENJOB_SAFETY_MODEL=gpt-5.4-mini
+SCREENJOB_HOST=127.0.0.1
+SCREENJOB_PORT=8787
+DISABLE_UI=false
 ```

-## Usage
+## Entry Points
+
+- `python main.py run "<job>"`
+- `python main.py server`
+- Backward-compatible wrapper: `python screenjob.py "<job>"`
+
+## CLI Usage

 ```powershell
-python main.py "Open amazon.de and go to my orders"
+python main.py run "Open amazon.de and go to my orders"
 ```

-Optional flags:
+Useful flags:

-```powershell
-python main.py "Open amazon.de" --model gpt-5.2 --max-steps 80
+- `--model gpt-5.4-mini`
+- `--disable-tool click --disable-tool type`
+- `--skip-safety-check`
+- `--max-steps 80`
+
+## HTTP API
+
+All API routes require token auth using `SCREENJOB_TOKEN`:
+
+- `Authorization: Bearer <token>` or
+- `X-ScreenJob-Token: <token>`
+- (for browser/image fetch) `?token=<token>` query parameter
+
+### Create Job
+
+`POST /api/jobs`
+
+Body:
+
+```json
+{
+  "job": "Open amazon.de and go to my orders",
+  "model": "gpt-5.4-mini",
+  "disabled_tools": ["click"],
+  "safety_override": false
+}
 ```

-## Tools exposed to the model
+Response:
+
+```json
+{ "job_id": "job_..." }
+```
+
+### Status / Output
+
+- `GET /api/jobs/{job_id}`: full status + output + live/final usage/cost
+- `GET /api/jobs/{job_id}/status`: alias of `GET /api/jobs/{job_id}` (returns the same payload)
+- `GET /api/jobs/{job_id}/events`: detailed timeline
+- `GET /api/jobs/{job_id}/artifact?path=<absolute_path>&token=<token>`: authenticated artifact file fetch for screenshots/enhancements
+- `GET /api/jobs`: list active + past jobs
+- `POST /api/jobs/{job_id}/cancel`: graceful cancellation
+- `GET /api/stats`: aggregate metrics
+
+## Monitoring UI
+
+- Served at `/` when `DISABLE_UI=false`
+- Tailwind-based read-only dashboard
+- Requires entering `SCREENJOB_TOKEN` in UI before data loads
+- Uses WebSocket `/ws` for live updates (tool calls, step events, usage/cost updates)
+- No task launch controls in UI (monitoring only)
+
+If `DISABLE_UI=true`, `/` returns `{ "ui_disabled": true }` and only API endpoints remain.
+
+## Safety
+
+Before execution, each task is classified by a model safety gate:
+
+- Safe: task runs
+- Unsafe: task is rejected and recorded
+- Override: set `safety_override=true` (or `--skip-safety-check` in CLI)
+
+## Tool Controls
+
+Per-job tool restrictions via a disable list:
+
+- API: `disabled_tools: ["type", "click"]`
+- CLI: `--disable-tool type --disable-tool click`
+
+Available tools:

 - `execute_command(command)`
- `sleep(seconds)` (replaces shell-based sleep calls)
+- `sleep(seconds)`
 - `see_screen()`
 - `enhance(coordinate)`
 - `click(coordinate, offset_up/down/left/right, sleep_after_seconds)`
@@ -44,51 +125,36 @@ python main.py "Open amazon.de" --model gpt-5.2 --max-steps 80
 - `press_key(key, repeats=1)`
 - `task_complete(result)`

-### Offset examples
+## Cost Estimation

- `{"coordinate":{"x":1000,"y":500},"offset_up":"2px"}`
- `{"coordinate":{"x":1000,"y":500},"offset_right":4}`
+Live/final cost is computed from OpenAI response usage (`input`, `cached_input`, `output`) and model pricing rates in `src/pricing.py`.

-### Multi-tool calls in one step
+- Live: exposed in `GET /api/jobs/{job_id}` during execution
+- Final: persisted in SQLite and returned in status output

-The agent supports multiple tool calls in a single model response and executes them in order.  
-Example sequence in one step:
+## Persistence

-1. `click(...)`
-2. `sleep({"seconds": 1.5})`
+- SQLite DB: `screenjob.db`
+- Runs/artifacts: `screenjob_runs/run_YYYYMMDD_HHMMSS/...`
+- Full event log per job (for history and UI)

-You can also use `click(..., sleep_after_seconds=1.5)` for a one-call variant.
-
-## Output
-
-Each run creates:
-
- `screenjob_runs/run_YYYYMMDD_HHMMSS/logs/screenjob.log`
- `screenjob_runs/run_YYYYMMDD_HHMMSS/screens/*.png`
- `screenjob_runs/run_YYYYMMDD_HHMMSS/enhanced/*.png`
-
-Final stdout is JSON:
-
-```json
-{
-  "completed": true,
-  "result": "...",
-  "steps": 13,
-  "elapsed_seconds": 59.691,
-  "artifacts_dir": "C:\\...\\screenjob_runs\\run_..."
-}
-```
-
-## Project layout
+## Project Layout

 ```text
 main.py
 screenjob.py
 src/
  __init__.py
-  cli.py
  agent.py
+  app_main.py
+  cli.py
+  config.py
  models.py
-  utils.py
+  pricing.py
+  runtime.py
+  safety.py
+  server.py
+  storage.py
+  task_manager.py
+  ui.py
 ```
-
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -1 +1 @@
-# Root package marker for local imports like: from src.cli import main
+# Root package marker for local imports.
--- a/src/server.py
+++ b/src/server.py
@@ -6,6 +6,7 @@ from pathlib import Path
 from typing import Any

 from fastapi import Depends, FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect
+from fastapi.responses import FileResponse
 from fastapi.responses import HTMLResponse, JSONResponse
 from pydantic import BaseModel, Field

@@ -86,7 +87,13 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
    async def _on_startup() -> None:
        ws_hub.set_loop(asyncio.get_running_loop())

-    def _extract_token(authorization: str | None, x_screenjob_token: str | None) -> str:
+    def _extract_token(
+        authorization: str | None,
+        x_screenjob_token: str | None,
+        query_token: str | None,
+    ) -> str:
+        if query_token:
+            return query_token.strip()
        if x_screenjob_token:
            return x_screenjob_token.strip()
        if authorization:
@@ -99,9 +106,10 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
    def require_token(
        authorization: str | None = Header(default=None),
        x_screenjob_token: str | None = Header(default=None),
+        token: str | None = Query(default=None),
    ) -> None:
-        token = _extract_token(authorization, x_screenjob_token)
-        if not token or not secrets.compare_digest(token, app_config.screenjob_token):
+        resolved = _extract_token(authorization, x_screenjob_token, token)
+        if not resolved or not secrets.compare_digest(resolved, app_config.screenjob_token):
            raise HTTPException(status_code=401, detail="Unauthorized")

    @app.post("/api/jobs")
@@ -130,6 +138,13 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

+    @app.get("/api/jobs/{job_id}/status")
+    def get_job_status(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
+        job = manager.get_job(job_id)
+        if job is None:
+            raise HTTPException(status_code=404, detail="Job not found")
+        return job
+
    @app.get("/api/jobs/{job_id}/events")
    def get_job_events(
        job_id: str,
@@ -149,6 +164,28 @@ def create_app(config: AppConfig | None = None) -> FastAPI:
        accepted = manager.cancel_job(job_id)
        return {"job_id": job_id, "cancel_requested": bool(accepted)}

+    @app.get("/api/jobs/{job_id}/artifact")
+    def get_job_artifact(
+        job_id: str,
+        path: str = Query(..., min_length=1),
+        _: None = Depends(require_token),
+    ) -> FileResponse:
+        job = manager.get_job(job_id)
+        if job is None:
+            raise HTTPException(status_code=404, detail="Job not found")
+        artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip()
+        if not artifacts_dir_raw:
+            raise HTTPException(status_code=404, detail="Artifacts not available yet")
+        artifacts_dir = Path(artifacts_dir_raw).resolve()
+        requested = Path(path).resolve()
+        try:
+            requested.relative_to(artifacts_dir)
+        except ValueError as exc:
+            raise HTTPException(status_code=400, detail="Artifact path is outside job artifacts directory") from exc
+        if not requested.exists() or not requested.is_file():
+            raise HTTPException(status_code=404, detail="Artifact not found")
+        return FileResponse(str(requested))
+
    @app.get("/api/stats")
    def stats(_: None = Depends(require_token)) -> dict[str, Any]:
        return manager.stats()
--- a/src/task_manager.py
+++ b/src/task_manager.py
@@ -191,6 +191,13 @@ class JobManager:

        def on_event(event: dict[str, Any]) -> None:
            self._publish(job_id, event)
+            if event.get("event_type") == "job_started":
+                run_id = str(((event.get("payload") or {}).get("run_id") or "")).strip()
+                if run_id:
+                    self.db.update_job(
+                        job_id,
+                        artifacts_dir=str((self.config.runs_dir / f"run_{run_id}").resolve()),
+                    )
            if event.get("event_type") == "usage_update":
                usage = (event.get("payload") or {}).get("usage") or {}
                self.db.update_job(
--- a/src/ui.py
+++ b/src/ui.py
@@ -37,6 +37,10 @@ def monitoring_page_html() -> str:
      <div class="lg:col-span-3 bg-slate-900/70 border border-slate-800 rounded-xl p-4 space-y-3">
        <h2 class="font-semibold">Job Detail</h2>
        <pre id="jobDetail" class="bg-slate-950 border border-slate-800 rounded p-3 text-xs overflow-auto max-h-[24vh]"></pre>
+        <h3 class="font-semibold text-sm">Latest Visual</h3>
+        <div class="bg-slate-950 border border-slate-800 rounded p-2">
+          <img id="latestVisual" alt="Latest visual update" class="max-h-[24vh] w-full object-contain rounded" />
+        </div>
        <h3 class="font-semibold text-sm">Live Events</h3>
        <div id="events" class="bg-slate-950 border border-slate-800 rounded p-3 text-xs overflow-auto max-h-[36vh] space-y-1"></div>
      </div>
@@ -51,6 +55,7 @@ def monitoring_page_html() -> str:
    const jobDetailEl = document.getElementById("jobDetail");
    const eventsEl = document.getElementById("events");
    const statsEl = document.getElementById("stats");
+    const latestVisualEl = document.getElementById("latestVisual");

    const state = {
      token: localStorage.getItem("screenjob_token") || "",
@@ -123,6 +128,15 @@ def monitoring_page_html() -> str:
      }
    }

+    function updateLatestVisualFromEvent(ev) {
+      if (!ev || ev.event_type !== "visual_update") return;
+      if (!state.selectedJobId || ev.job_id !== state.selectedJobId) return;
+      const imagePath = ev.payload && ev.payload.image_meta && ev.payload.image_meta.path;
+      if (!imagePath) return;
+      const q = encodeURIComponent(imagePath);
+      latestVisualEl.src = `/api/jobs/${state.selectedJobId}/artifact?path=${q}&token=${encodeURIComponent(state.token)}`;
+    }
+
    async function refreshJobs() {
      const payload = await api("/api/jobs?limit=100");
      state.jobs = payload.jobs || [];
@@ -143,7 +157,10 @@ def monitoring_page_html() -> str:
      ]);
      jobDetailEl.textContent = JSON.stringify(job, null, 2);
      eventsEl.innerHTML = "";
-      for (const ev of (events.events || []).slice().reverse()) pushEventLine(ev);
+      const list = (events.events || []).slice().reverse();
+      for (const ev of list) pushEventLine(ev);
+      const visual = list.find((ev) => ev.event_type === "visual_update");
+      if (visual) updateLatestVisualFromEvent(visual);
    }

    function connectWs() {
@@ -158,6 +175,7 @@ def monitoring_page_html() -> str:
        try {
          const payload = JSON.parse(event.data);
          pushEventLine(payload);
+          updateLatestVisualFromEvent(payload);
          if (!state.selectedJobId || payload.job_id === state.selectedJobId) {
            await refreshJobDetail();
          }
@@ -190,4 +208,3 @@ def monitoring_page_html() -> str:
 </body>
 </html>
 """
-