from __future__ import annotations

import asyncio
import secrets
import socket
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any

from fastapi import Depends, FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse
from fastapi.responses import HTMLResponse, JSONResponse
from pydantic import BaseModel, Field

from .agent import normalize_disabled_tools
from .config import AppConfig, load_app_config
from .storage import HistoryDB
from .task_manager import JobManager
from .ui import monitoring_js_path, monitoring_page_html
from .utils import utc_now_iso


# Request body accepted by ``POST /api/jobs``. Every field except ``job`` has
# a default, and the numeric fields are range-checked by pydantic before the
# handler runs. (Documented with comments rather than a docstring so the
# generated schema description stays untouched.)
class CreateJobRequest(BaseModel):
    # Required, non-empty job text.
    job: str = Field(..., min_length=1)
    # Optional model override; ``None`` leaves the choice to the job manager.
    model: str | None = None

    # Execution limits and pacing.
    max_steps: int = Field(60, ge=1, le=400)
    command_timeout: int = Field(45, ge=1, le=600)
    type_interval: float = Field(0.02, ge=0.0, le=1.0)
    click_pause: float = Field(0.10, ge=0.0, le=2.0)
    reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$")

    # Visual-context tuning.
    screen_context_decay_steps: int = Field(4, ge=0, le=50)
    max_visual_context_images: int = Field(3, ge=0, le=12)

    # Native automation mode and its timeouts / retry budget.
    native_automation_mode: str = Field("prefer", pattern="^(off|prefer|require_fallback)$")
    dialog_timeout_seconds: float = Field(12.0, ge=0.5, le=120.0)
    focus_timeout_seconds: float = Field(8.0, ge=0.5, le=120.0)
    ui_element_timeout_seconds: float = Field(8.0, ge=0.5, le=120.0)
    max_retries_per_surface: int = Field(3, ge=1, le=10)

    # Logging, tool and safety switches.
    pretty_logs: bool = False
    disabled_tools: list[str] = Field(default_factory=list)
    safety_override: bool = False
    no_failsafe: bool = False


def _safe_int(value: Any) -> int | None:
    """Convert *value* with ``int()``; return ``None`` if the conversion raises."""
    try:
        converted = int(value)
    except Exception:  # noqa: BLE001
        return None
    return converted


def _safe_text(value: Any, limit: int = 180) -> str:
    """Return *value* as stripped text, cut to *limit* characters plus ``...``.

    Falsy values (``None``, ``""``, ``0``, ...) produce an empty string.
    """
    cleaned = str(value or "").strip()
    return cleaned if len(cleaned) <= limit else f"{cleaned[:limit]}..."
def _resolve_artifact_path(artifacts_dir: Path | None, path_raw: Any) -> Path | None:
    """Resolve *path_raw* and return it only if it lies inside *artifacts_dir*.

    Args:
        artifacts_dir: Already-resolved artifacts directory, or ``None``.
        path_raw: Candidate path (any value; converted with ``str``).

    Returns:
        The resolved path, or ``None`` when there is no artifacts directory,
        the path is empty, it cannot be resolved, or it falls outside
        *artifacts_dir*.
    """
    if artifacts_dir is None:
        return None
    text = str(path_raw or "").strip()
    if not text:
        return None
    try:
        # ``resolve()`` itself can raise (e.g. ValueError for an embedded NUL
        # byte), so it must sit inside the guarded region as well.
        candidate = Path(text).resolve()
        candidate.relative_to(artifacts_dir)
    except (OSError, RuntimeError, ValueError):
        return None
    return candidate


def _extract_replay_action(
    event: dict[str, Any],
    pending_tool_args: dict[tuple[int, str], list[dict[str, Any]]],
) -> dict[str, Any] | None:
    """Turn a ``tool_called`` / ``tool_result`` event into a replay overlay entry.

    Args:
        event: Event record; ``event_type``, ``payload``, ``step``, ``ts`` and
            ``id`` are read from it.
        pending_tool_args: Mutable FIFO of call arguments keyed by
            ``(step, tool)``. A ``tool_called`` event appends to it and the
            matching ``tool_result`` event pops from it, so a result can be
            labelled with the arguments of the call that produced it.

    Returns:
        A dict describing the action, or ``None`` for any other event type.
    """
    event_type = str(event.get("event_type") or "")
    raw_payload = event.get("payload")
    payload = raw_payload if isinstance(raw_payload, dict) else {}
    step = int(event.get("step") or 0)
    ts = str(event.get("ts") or "")
    event_id = int(event.get("id") or 0)

    if event_type == "tool_called":
        tool = str(payload.get("tool") or "").strip()
        raw_args = payload.get("args")
        args = raw_args if isinstance(raw_args, dict) else {}
        if tool:
            # Remember the arguments so the later result event can use them.
            pending_tool_args.setdefault((step, tool), []).append(args)
        action: dict[str, Any] = {
            "ts": ts,
            "step": step,
            "event_id": event_id,
            "kind": "tool_called",
            "tool": tool,
            "label": f"Call: {tool}" if tool else "Tool call",
        }
        if tool == "click":
            coord = args.get("coordinate")
            if isinstance(coord, dict):
                x = _safe_int(coord.get("x"))
                y = _safe_int(coord.get("y"))
                if x is not None and y is not None:
                    action["requested_click"] = {"x": x, "y": y}
                    action["label"] = f"Call: click ({x}, {y})"
        elif tool == "type":
            text = _safe_text(args.get("text"), 120)
            if text:
                action["text_preview"] = text
                action["label"] = f"Call: type \"{text}\""
        return action

    if event_type == "tool_result":
        tool = str(payload.get("tool") or "").strip()
        raw_result = payload.get("result")
        result = raw_result if isinstance(raw_result, dict) else {}
        ok = bool(result.get("ok"))

        # Pair this result with the oldest unmatched call for the same
        # (step, tool); drop the queue entry once it is drained.
        matching_args: dict[str, Any] = {}
        key = (step, tool)
        queued = pending_tool_args.get(key) or []
        if queued:
            matching_args = queued.pop(0) or {}
            if not queued:
                pending_tool_args.pop(key, None)

        action = {
            "ts": ts,
            "step": step,
            "event_id": event_id,
            "kind": "tool_result",
            "tool": tool,
            "ok": ok,
            "label": f"Result: {tool}",
        }
        if tool == "click":
            raw_clicked = result.get("clicked")
            clicked = raw_clicked if isinstance(raw_clicked, dict) else {}
            x = _safe_int(clicked.get("x"))
            y = _safe_int(clicked.get("y"))
            if x is not None and y is not None:
                action["click"] = {"x": x, "y": y}
                action["label"] = f"Clicked ({x}, {y})" if ok else f"Click failed ({x}, {y})"
        elif tool == "type":
            text = _safe_text(matching_args.get("text"), 120)
            typed_length = _safe_int(result.get("typed_length"))
            if typed_length is not None:
                action["typed_length"] = typed_length
            if text:
                action["text_preview"] = text
                action["label"] = f"Typed \"{text}\""
        elif tool == "press_key":
            key_name = _safe_text(result.get("key"), 80)
            if key_name:
                action["label"] = f"Pressed {key_name}"
        elif tool == "execute_command":
            command = _safe_text(matching_args.get("command"), 140)
            if command:
                action["command_preview"] = command
                action["label"] = f"Command: {command}"
        return action

    return None


def _build_replay_payload(job_id: str, job: dict[str, Any], events: list[dict[str, Any]]) -> dict[str, Any]:
    """Group a job's events into replay frames.

    Each ``visual_update`` event whose image resolves to an existing file
    inside the job's ``artifacts_dir`` becomes a frame. Tool actions seen since
    the previous frame are attached to it as ``overlays``; actions after the
    last frame are returned as ``trailing_events``.

    Args:
        job_id: Identifier echoed back in the payload.
        job: Job record; only ``artifacts_dir`` is read.
        events: Event records in the order they should be replayed.

    Returns:
        Dict with ``job_id``, ``total_events``, ``total_frames``, ``frames``
        and ``trailing_events``.
    """
    artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip()
    artifacts_dir = Path(artifacts_dir_raw).resolve() if artifacts_dir_raw else None

    pending_tool_args: dict[tuple[int, str], list[dict[str, Any]]] = {}
    buffered_actions: list[dict[str, Any]] = []
    frames: list[dict[str, Any]] = []

    for event in events:
        action = _extract_replay_action(event, pending_tool_args)
        if action is not None:
            buffered_actions.append(action)
        if str(event.get("event_type") or "") != "visual_update":
            continue

        raw_payload = event.get("payload")
        payload = raw_payload if isinstance(raw_payload, dict) else {}
        raw_meta = payload.get("image_meta")
        image_meta = raw_meta if isinstance(raw_meta, dict) else {}

        resolved = _resolve_artifact_path(artifacts_dir, image_meta.get("path"))
        # ``is_file()`` is already False for a missing path.
        if resolved is None or not resolved.is_file():
            continue

        width = _safe_int(image_meta.get("width"))
        height = _safe_int(image_meta.get("height"))
        if width is None or height is None:
            # Fall back to the nested ``screen_size`` mapping.
            raw_size = image_meta.get("screen_size")
            size = raw_size if isinstance(raw_size, dict) else {}
            width = _safe_int(size.get("width"))
            height = _safe_int(size.get("height"))

        is_fullscreen = (
            str(payload.get("kind") or "") == "see_screen"
            and bool(image_meta.get("grid"))
            and isinstance(width, int)
            and isinstance(height, int)
            and width > 0
            and height > 0
        )
        frames.append(
            {
                "frame_index": len(frames),
                "event_id": int(event.get("id") or 0),
                "ts": str(event.get("ts") or ""),
                "step": int(event.get("step") or 0),
                "kind": str(payload.get("kind") or "visual_update"),
                "image_path": str(resolved),
                "image_meta": image_meta,
                "screen_size": {"width": width, "height": height} if width and height else None,
                "is_fullscreen": is_fullscreen,
                "overlays": buffered_actions,
            }
        )
        # Start a fresh buffer; the frame keeps the old list.
        buffered_actions = []

    return {
        "job_id": job_id,
        "total_events": len(events),
        "total_frames": len(frames),
        "frames": frames,
        "trailing_events": buffered_actions,
    }


def _tokens_match(candidate: str | None, expected: str | None) -> bool:
    """Compare two tokens in constant time; empty or missing tokens never match.

    ``secrets.compare_digest`` raises ``TypeError`` for ``str`` arguments that
    contain non-ASCII characters, which would turn a malformed client token
    into an unhandled server error. Comparing the UTF-8 bytes avoids that.
    """
    if not candidate or not expected:
        return False
    return secrets.compare_digest(
        candidate.encode("utf-8", "surrogatepass"),
        expected.encode("utf-8", "surrogatepass"),
    )


class _WebSocketHub:
    """Set of accepted WebSocket connections with JSON broadcast.

    ``broadcast_from_thread`` lets code running outside the event loop
    schedule a broadcast on the loop registered through ``set_loop``.
    """

    def __init__(self) -> None:
        self._connections: set[WebSocket] = set()
        self._lock = asyncio.Lock()
        self._loop: asyncio.AbstractEventLoop | None = None

    def set_loop(self, loop: asyncio.AbstractEventLoop) -> None:
        """Register the event loop used by ``broadcast_from_thread``."""
        self._loop = loop

    async def connect(self, websocket: WebSocket) -> None:
        """Accept *websocket* and add it to the connection set."""
        await websocket.accept()
        async with self._lock:
            self._connections.add(websocket)

    async def disconnect(self, websocket: WebSocket) -> None:
        """Remove *websocket* from the connection set (no-op if absent)."""
        async with self._lock:
            self._connections.discard(websocket)

    async def broadcast(self, message: dict[str, Any]) -> None:
        """Send *message* as JSON to every connection, dropping ones that fail."""
        # Snapshot under the lock, send outside it.
        async with self._lock:
            clients = list(self._connections)
        dead: list[WebSocket] = []
        for ws in clients:
            try:
                await ws.send_json(message)
            except Exception:  # noqa: BLE001
                dead.append(ws)
        if dead:
            async with self._lock:
                for ws in dead:
                    self._connections.discard(ws)

    def broadcast_from_thread(self, message: dict[str, Any]) -> None:
        """Schedule ``broadcast(message)`` on the registered loop.

        Does nothing when no loop is registered or the loop is closed.
        """
        loop = self._loop
        if loop is None or loop.is_closed():
            return
        coro = self.broadcast(message)
        try:
            asyncio.run_coroutine_threadsafe(coro, loop)
        except RuntimeError:
            # The loop closed between the check and the call; discard the
            # coroutine so it is not reported as never awaited.
            coro.close()


def create_app(config: AppConfig | None = None) -> FastAPI:
    """Build the FastAPI application with its HTTP and WebSocket routes.

    Args:
        config: Application configuration. When omitted (or falsy) it is
            loaded via ``load_app_config(cwd=Path.cwd())``.

    Returns:
        The configured app. ``config``, ``db``, ``ws_hub`` and ``manager`` are
        exposed on ``app.state``.

    Raises:
        RuntimeError: If ``openai_api_key`` or ``screenjob_token`` is unset.
    """
    app_config = config or load_app_config(cwd=Path.cwd())
    if not app_config.openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is required in environment or .env.")
    if not app_config.screenjob_token:
        raise RuntimeError("SCREENJOB_TOKEN is required in environment or .env.")

    db = HistoryDB(app_config.db_path)
    ws_hub = _WebSocketHub()
    manager = JobManager(config=app_config, db=db, broadcast=ws_hub.broadcast_from_thread)

    @asynccontextmanager
    async def lifespan(_: FastAPI):
        # Capture the running loop so broadcast_from_thread has a target.
        ws_hub.set_loop(asyncio.get_running_loop())
        yield

    app = FastAPI(title="ScreenJob API", version="1.0.0", lifespan=lifespan)
    app.state.config = app_config
    app.state.db = db
    app.state.ws_hub = ws_hub
    app.state.manager = manager
    device_hostname = socket.gethostname()

    def _extract_token(
        authorization: str | None,
        x_screenjob_token: str | None,
        query_token: str | None,
    ) -> str:
        """Pick the token: query parameter, then custom header, then Authorization."""
        if query_token:
            return query_token.strip()
        if x_screenjob_token:
            return x_screenjob_token.strip()
        if authorization:
            token = authorization.strip()
            if token.lower().startswith("bearer "):
                return token[7:].strip()
            return token
        return ""

    def require_token(
        authorization: str | None = Header(default=None),
        x_screenjob_token: str | None = Header(default=None),
        token: str | None = Query(default=None),
    ) -> None:
        """Dependency that rejects the request with 401 unless the token matches."""
        resolved = _extract_token(authorization, x_screenjob_token, token)
        if not _tokens_match(resolved, app_config.screenjob_token):
            raise HTTPException(status_code=401, detail="Unauthorized")

    # Submit a new job; ValueError from validation/submission maps to 400.
    @app.post("/api/jobs")
    def create_job(payload: CreateJobRequest, _: None = Depends(require_token)) -> dict[str, str]:
        try:
            disabled_tools = normalize_disabled_tools(payload.disabled_tools)
            job_id = manager.submit_job(
                objective=payload.job,
                model=payload.model,
                max_steps=payload.max_steps,
                command_timeout=payload.command_timeout,
                type_interval=payload.type_interval,
                click_pause=payload.click_pause,
                reasoning_effort=payload.reasoning_effort,
                screen_context_decay_steps=payload.screen_context_decay_steps,
                max_visual_context_images=payload.max_visual_context_images,
                native_automation_mode=payload.native_automation_mode,
                dialog_timeout_seconds=payload.dialog_timeout_seconds,
                focus_timeout_seconds=payload.focus_timeout_seconds,
                ui_element_timeout_seconds=payload.ui_element_timeout_seconds,
                max_retries_per_surface=payload.max_retries_per_surface,
                pretty_logs=payload.pretty_logs,
                disabled_tools=disabled_tools,
                safety_override=payload.safety_override,
                no_failsafe=payload.no_failsafe,
            )
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        return {"job_id": job_id}

    # List jobs, capped by ``limit``.
    @app.get("/api/jobs")
    def list_jobs(limit: int = Query(default=100, ge=1, le=500), _: None = Depends(require_token)) -> dict[str, Any]:
        return {"jobs": manager.list_jobs(limit=limit)}

    # Fetch one job record.
    @app.get("/api/jobs/{job_id}")
    def get_job(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

    # Status endpoint; currently returns the same record as get_job.
    @app.get("/api/jobs/{job_id}/status")
    def get_job_status(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

    # Raw events for a job.
    @app.get("/api/jobs/{job_id}/events")
    def get_job_events(
        job_id: str,
        limit: int = Query(default=500, ge=1, le=5000),
        _: None = Depends(require_token),
    ) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return {"events": manager.get_events(job_id, limit=limit)}

    # Events grouped into replay frames.
    @app.get("/api/jobs/{job_id}/replay")
    def get_job_replay(
        job_id: str,
        limit: int = Query(default=5000, ge=1, le=5000),
        _: None = Depends(require_token),
    ) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        events = manager.get_events(job_id, limit=limit)
        return _build_replay_payload(job_id, job, events)

    # Request cancellation of a job.
    @app.post("/api/jobs/{job_id}/cancel")
    def cancel_job(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        accepted = manager.cancel_job(job_id)
        return {"job_id": job_id, "cancel_requested": bool(accepted)}

    # Serve a file from the job's artifacts directory; paths outside it are
    # rejected with 400.
    @app.get("/api/jobs/{job_id}/artifact")
    def get_job_artifact(
        job_id: str,
        path: str = Query(..., min_length=1),
        _: None = Depends(require_token),
    ) -> FileResponse:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip()
        if not artifacts_dir_raw:
            raise HTTPException(status_code=404, detail="Artifacts not available yet")
        artifacts_dir = Path(artifacts_dir_raw).resolve()
        try:
            # The path is client-supplied; resolving it can itself fail
            # (e.g. embedded NUL byte), which must be a 400, not a 500.
            requested = Path(path).resolve()
        except (OSError, RuntimeError, ValueError) as exc:
            raise HTTPException(status_code=400, detail="Invalid artifact path") from exc
        try:
            requested.relative_to(artifacts_dir)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Artifact path is outside job artifacts directory") from exc
        if not requested.is_file():
            raise HTTPException(status_code=404, detail="Artifact not found")
        return FileResponse(str(requested))

    @app.get("/api/stats")
    def stats(_: None = Depends(require_token)) -> dict[str, Any]:
        return manager.stats()

    # Analytics from the manager, stamped with the generation time.
    @app.get("/api/analytics")
    def analytics(_: None = Depends(require_token)) -> dict[str, Any]:
        payload = manager.analytics()
        payload["generated_at"] = utc_now_iso()
        return payload

    if not app_config.disable_ui:

        @app.get("/", response_class=HTMLResponse)
        def ui_root() -> str:
            return monitoring_page_html(device_hostname=device_hostname)

        @app.get("/ui/monitoring.js")
        def ui_monitoring_js() -> FileResponse:
            return FileResponse(str(monitoring_js_path()), media_type="application/javascript")

        # Live event stream; the token travels in the query string.
        @app.websocket("/ws")
        async def ws_endpoint(websocket: WebSocket, token: str = Query(default="")) -> None:
            if not _tokens_match(token, app_config.screenjob_token):
                await websocket.close(code=1008)
                return
            await ws_hub.connect(websocket)
            try:
                await websocket.send_json({"event_type": "connected", "payload": {"ok": True}})
                # Incoming messages are read and discarded; the loop exists to
                # notice when the client goes away.
                while True:
                    await websocket.receive_text()
            except WebSocketDisconnect:
                pass
            except Exception:  # noqa: BLE001
                # Best-effort: any other failure simply ends the session.
                pass
            finally:
                # Also runs on task cancellation, so the hub never keeps a
                # stale connection.
                await ws_hub.disconnect(websocket)

    else:

        @app.get("/", response_class=JSONResponse)
        def ui_disabled() -> dict[str, Any]:
            return {"ok": True, "ui_disabled": True}

    return app


def main() -> None:
    """Load configuration from the current directory and serve the app with uvicorn."""
    import uvicorn

    app = create_app(load_app_config(Path.cwd()))
    config = app.state.config
    uvicorn.run(app, host=config.host, port=config.port, log_level="info")