Files
screenjob/src/server.py
Space-Banane 4123765aba
Some checks failed
CI / test (push) Failing after 8s
Commit remaining workspace updates
2026-05-31 20:43:36 +02:00

452 lines
17 KiB
Python

from __future__ import annotations
import asyncio
import secrets
import socket
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any
from fastapi import Depends, FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect
from fastapi.responses import FileResponse
from fastapi.responses import HTMLResponse, JSONResponse
from pydantic import BaseModel, Field
from .agent import normalize_disabled_tools
from .config import AppConfig, load_app_config
from .storage import HistoryDB
from .task_manager import JobManager
from .ui import monitoring_js_path, monitoring_page_html
from .utils import utc_now_iso
class CreateJobRequest(BaseModel):
    # JSON body accepted by ``POST /api/jobs``. ``create_job`` forwards every
    # field to ``JobManager.submit_job``; the ``Field`` constraints below are the
    # only validation performed in this module.

    # Free-text job description (must be non-empty); passed on as ``objective``.
    job: str = Field(..., min_length=1)
    # Optional model name; ``None`` is forwarded unchanged to the job manager.
    model: str | None = None
    # Bounded integer limits (1-400 and 1-600 respectively).
    # NOTE(review): units of command_timeout are presumably seconds - confirm in JobManager.
    max_steps: int = Field(60, ge=1, le=400)
    command_timeout: int = Field(45, ge=1, le=600)
    # Input pacing values, bounded to 0-1 and 0-2.
    # NOTE(review): presumably seconds between keystrokes / after clicks - confirm.
    type_interval: float = Field(0.02, ge=0.0, le=1.0)
    click_pause: float = Field(0.10, ge=0.0, le=2.0)
    # Must be exactly one of: low, medium, high.
    reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$")
    # Visual-context tuning knobs; 0 is an accepted value for both.
    screen_context_decay_steps: int = Field(4, ge=0, le=50)
    max_visual_context_images: int = Field(3, ge=0, le=12)
    # Must be exactly one of: off, prefer, require_fallback.
    native_automation_mode: str = Field("prefer", pattern="^(off|prefer|require_fallback)$")
    # Timeouts, each bounded to 0.5-120 (the ``_seconds`` suffix gives the unit).
    dialog_timeout_seconds: float = Field(12.0, ge=0.5, le=120.0)
    focus_timeout_seconds: float = Field(8.0, ge=0.5, le=120.0)
    ui_element_timeout_seconds: float = Field(8.0, ge=0.5, le=120.0)
    # Retry budget, bounded to 1-10.
    max_retries_per_surface: int = Field(3, ge=1, le=10)
    pretty_logs: bool = False
    # Tool names to disable; run through ``normalize_disabled_tools`` by
    # ``create_job`` before submission. ``default_factory`` gives each request
    # its own fresh list.
    disabled_tools: list[str] = Field(default_factory=list)
    # Safety-related switches, both off by default and forwarded as-is.
    # NOTE(review): their exact effect is defined by the job manager/agent - confirm there.
    safety_override: bool = False
    no_failsafe: bool = False
def _safe_int(value: Any) -> int | None:
try:
return int(value)
except Exception: # noqa: BLE001
return None
def _safe_text(value: Any, limit: int = 180) -> str:
text = str(value or "").strip()
if len(text) <= limit:
return text
return f"{text[:limit]}..."
def _resolve_artifact_path(artifacts_dir: Path | None, path_raw: Any) -> Path | None:
if artifacts_dir is None:
return None
text = str(path_raw or "").strip()
if not text:
return None
candidate = Path(text).resolve()
try:
candidate.relative_to(artifacts_dir)
except ValueError:
return None
return candidate
def _extract_replay_action(
event: dict[str, Any],
pending_tool_args: dict[tuple[int, str], list[dict[str, Any]]],
) -> dict[str, Any] | None:
event_type = str(event.get("event_type") or "")
payload = event.get("payload") if isinstance(event.get("payload"), dict) else {}
step = int(event.get("step") or 0)
ts = str(event.get("ts") or "")
event_id = int(event.get("id") or 0)
if event_type == "tool_called":
tool = str(payload.get("tool") or "").strip()
args = payload.get("args") if isinstance(payload.get("args"), dict) else {}
if tool:
pending_tool_args.setdefault((step, tool), []).append(args)
action: dict[str, Any] = {
"ts": ts,
"step": step,
"event_id": event_id,
"kind": "tool_called",
"tool": tool,
"label": f"Call: {tool}" if tool else "Tool call",
}
if tool == "click":
coord = args.get("coordinate") if isinstance(args, dict) else None
if isinstance(coord, dict):
x = _safe_int(coord.get("x"))
y = _safe_int(coord.get("y"))
if x is not None and y is not None:
action["requested_click"] = {"x": x, "y": y}
action["label"] = f"Call: click ({x}, {y})"
elif tool == "type":
text = _safe_text((args or {}).get("text"), 120)
if text:
action["text_preview"] = text
action["label"] = f"Call: type \"{text}\""
return action
if event_type == "tool_result":
tool = str(payload.get("tool") or "").strip()
result = payload.get("result") if isinstance(payload.get("result"), dict) else {}
matching_args: dict[str, Any] = {}
key = (step, tool)
queued = pending_tool_args.get(key) or []
if queued:
matching_args = queued.pop(0)
if not queued:
pending_tool_args.pop(key, None)
action = {
"ts": ts,
"step": step,
"event_id": event_id,
"kind": "tool_result",
"tool": tool,
"ok": bool(result.get("ok")),
"label": f"Result: {tool}",
}
if tool == "click":
clicked = result.get("clicked") if isinstance(result.get("clicked"), dict) else {}
x = _safe_int(clicked.get("x"))
y = _safe_int(clicked.get("y"))
if x is not None and y is not None:
action["click"] = {"x": x, "y": y}
action["label"] = f"Clicked ({x}, {y})" if bool(result.get("ok")) else f"Click failed ({x}, {y})"
elif tool == "type":
text = _safe_text((matching_args or {}).get("text"), 120)
typed_length = _safe_int(result.get("typed_length"))
if typed_length is not None:
action["typed_length"] = typed_length
if text:
action["text_preview"] = text
action["label"] = f"Typed \"{text}\""
elif tool == "press_key":
key_name = _safe_text(result.get("key"), 80)
if key_name:
action["label"] = f"Pressed {key_name}"
elif tool == "execute_command":
command = _safe_text((matching_args or {}).get("command"), 140)
if command:
action["command_preview"] = command
action["label"] = f"Command: {command}"
return action
return None
def _build_replay_payload(job_id: str, job: dict[str, Any], events: list[dict[str, Any]]) -> dict[str, Any]:
artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip()
artifacts_dir = Path(artifacts_dir_raw).resolve() if artifacts_dir_raw else None
pending_tool_args: dict[tuple[int, str], list[dict[str, Any]]] = {}
buffered_actions: list[dict[str, Any]] = []
frames: list[dict[str, Any]] = []
for event in events:
action = _extract_replay_action(event, pending_tool_args)
if action is not None:
buffered_actions.append(action)
if str(event.get("event_type") or "") != "visual_update":
continue
payload = event.get("payload") if isinstance(event.get("payload"), dict) else {}
image_meta = payload.get("image_meta") if isinstance(payload.get("image_meta"), dict) else {}
resolved = _resolve_artifact_path(artifacts_dir, image_meta.get("path"))
if resolved is None or not resolved.exists() or not resolved.is_file():
continue
width = _safe_int(image_meta.get("width"))
height = _safe_int(image_meta.get("height"))
if width is None or height is None:
size = image_meta.get("screen_size") if isinstance(image_meta.get("screen_size"), dict) else {}
width = _safe_int(size.get("width"))
height = _safe_int(size.get("height"))
is_fullscreen = (
str(payload.get("kind") or "") == "see_screen"
and bool(image_meta.get("grid"))
and isinstance(width, int)
and isinstance(height, int)
and width > 0
and height > 0
)
frames.append(
{
"frame_index": len(frames),
"event_id": int(event.get("id") or 0),
"ts": str(event.get("ts") or ""),
"step": int(event.get("step") or 0),
"kind": str(payload.get("kind") or "visual_update"),
"image_path": str(resolved),
"image_meta": image_meta,
"screen_size": {"width": width, "height": height} if width and height else None,
"is_fullscreen": is_fullscreen,
"overlays": buffered_actions,
}
)
buffered_actions = []
return {
"job_id": job_id,
"total_events": len(events),
"total_frames": len(frames),
"frames": frames,
"trailing_events": buffered_actions,
}
class _WebSocketHub:
def __init__(self) -> None:
self._connections: set[WebSocket] = set()
self._lock = asyncio.Lock()
self._loop: asyncio.AbstractEventLoop | None = None
def set_loop(self, loop: asyncio.AbstractEventLoop) -> None:
self._loop = loop
async def connect(self, websocket: WebSocket) -> None:
await websocket.accept()
async with self._lock:
self._connections.add(websocket)
async def disconnect(self, websocket: WebSocket) -> None:
async with self._lock:
self._connections.discard(websocket)
async def broadcast(self, message: dict[str, Any]) -> None:
async with self._lock:
clients = list(self._connections)
dead: list[WebSocket] = []
for ws in clients:
try:
await ws.send_json(message)
except Exception: # noqa: BLE001
dead.append(ws)
if dead:
async with self._lock:
for ws in dead:
self._connections.discard(ws)
def broadcast_from_thread(self, message: dict[str, Any]) -> None:
if self._loop is None:
return
asyncio.run_coroutine_threadsafe(self.broadcast(message), self._loop)
def create_app(config: AppConfig | None = None) -> FastAPI:
    """Build the ScreenJob FastAPI application and wire up its routes.

    Args:
        config: Application settings. When omitted (or falsy) they are loaded
            with ``load_app_config`` using the current working directory.

    Returns:
        The configured app. ``app.state`` exposes ``config``, ``db``,
        ``ws_hub`` and ``manager``.

    Raises:
        RuntimeError: If the OpenAI API key or the ScreenJob token is missing.

    Every ``/api/*`` route requires the ScreenJob token. The HTML UI, its
    JavaScript file and the ``/ws`` WebSocket are only registered when
    ``config.disable_ui`` is false; otherwise ``/`` returns a small JSON stub.
    """
    app_config = config or load_app_config(cwd=Path.cwd())
    if not app_config.openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is required in environment or .env.")
    if not app_config.screenjob_token:
        raise RuntimeError("SCREENJOB_TOKEN is required in environment or .env.")

    db = HistoryDB(app_config.db_path)
    ws_hub = _WebSocketHub()
    manager = JobManager(config=app_config, db=db, broadcast=ws_hub.broadcast_from_thread)

    @asynccontextmanager
    async def lifespan(_: FastAPI):
        # Give the hub the running loop so other threads can schedule
        # broadcasts onto it.
        ws_hub.set_loop(asyncio.get_running_loop())
        yield

    app = FastAPI(title="ScreenJob API", version="1.0.0", lifespan=lifespan)
    app.state.config = app_config
    app.state.db = db
    app.state.ws_hub = ws_hub
    app.state.manager = manager
    device_hostname = socket.gethostname()

    # Tokens are compared as bytes: ``secrets.compare_digest`` only accepts
    # ``str`` arguments that are pure ASCII and raises TypeError otherwise,
    # which would turn a bad token containing non-ASCII characters into a
    # server error instead of a rejection. "surrogatepass" makes the encoding
    # succeed for every possible str.
    expected_token = app_config.screenjob_token.encode("utf-8", "surrogatepass")

    def _token_is_valid(candidate: str) -> bool:
        """Return True when *candidate* equals the configured token (constant-time compare)."""
        if not candidate:
            return False
        return secrets.compare_digest(candidate.encode("utf-8", "surrogatepass"), expected_token)

    def _extract_token(
        authorization: str | None,
        x_screenjob_token: str | None,
        query_token: str | None,
    ) -> str:
        """Pick the presented token: query parameter, then custom header, then Authorization."""
        if query_token:
            return query_token.strip()
        if x_screenjob_token:
            return x_screenjob_token.strip()
        if authorization:
            token = authorization.strip()
            # Accept both "Bearer <token>" and a bare token.
            if token.lower().startswith("bearer "):
                return token[7:].strip()
            return token
        return ""

    def require_token(
        authorization: str | None = Header(default=None),
        x_screenjob_token: str | None = Header(default=None),
        token: str | None = Query(default=None),
    ) -> None:
        """Dependency that rejects the request with 401 unless a valid token is presented."""
        resolved = _extract_token(authorization, x_screenjob_token, token)
        if not _token_is_valid(resolved):
            raise HTTPException(status_code=401, detail="Unauthorized")

    @app.post("/api/jobs")
    def create_job(payload: CreateJobRequest, _: None = Depends(require_token)) -> dict[str, str]:
        """Submit a new job; a ``ValueError`` raised while submitting becomes a 400."""
        try:
            disabled_tools = normalize_disabled_tools(payload.disabled_tools)
            job_id = manager.submit_job(
                objective=payload.job,
                model=payload.model,
                max_steps=payload.max_steps,
                command_timeout=payload.command_timeout,
                type_interval=payload.type_interval,
                click_pause=payload.click_pause,
                reasoning_effort=payload.reasoning_effort,
                screen_context_decay_steps=payload.screen_context_decay_steps,
                max_visual_context_images=payload.max_visual_context_images,
                native_automation_mode=payload.native_automation_mode,
                dialog_timeout_seconds=payload.dialog_timeout_seconds,
                focus_timeout_seconds=payload.focus_timeout_seconds,
                ui_element_timeout_seconds=payload.ui_element_timeout_seconds,
                max_retries_per_surface=payload.max_retries_per_surface,
                pretty_logs=payload.pretty_logs,
                disabled_tools=disabled_tools,
                safety_override=payload.safety_override,
                no_failsafe=payload.no_failsafe,
            )
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        return {"job_id": job_id}

    @app.get("/api/jobs")
    def list_jobs(limit: int = Query(default=100, ge=1, le=500), _: None = Depends(require_token)) -> dict[str, Any]:
        """List jobs, at most *limit* of them."""
        return {"jobs": manager.list_jobs(limit=limit)}

    @app.get("/api/jobs/{job_id}")
    def get_job(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        """Return the stored record of one job, or 404."""
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

    @app.get("/api/jobs/{job_id}/status")
    def get_job_status(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        """Return the same job record as ``GET /api/jobs/{job_id}``, or 404."""
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

    @app.get("/api/jobs/{job_id}/events")
    def get_job_events(
        job_id: str,
        limit: int = Query(default=500, ge=1, le=5000),
        _: None = Depends(require_token),
    ) -> dict[str, Any]:
        """Return up to *limit* events of a job, or 404 if the job is unknown."""
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return {"events": manager.get_events(job_id, limit=limit)}

    @app.get("/api/jobs/{job_id}/replay")
    def get_job_replay(
        job_id: str,
        limit: int = Query(default=5000, ge=1, le=5000),
        _: None = Depends(require_token),
    ) -> dict[str, Any]:
        """Return the frame-by-frame replay built from up to *limit* job events."""
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        events = manager.get_events(job_id, limit=limit)
        return _build_replay_payload(job_id, job, events)

    @app.post("/api/jobs/{job_id}/cancel")
    def cancel_job(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        """Ask the manager to cancel a job and report whether the request was accepted."""
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        accepted = manager.cancel_job(job_id)
        return {"job_id": job_id, "cancel_requested": bool(accepted)}

    @app.get("/api/jobs/{job_id}/artifact")
    def get_job_artifact(
        job_id: str,
        path: str = Query(..., min_length=1),
        _: None = Depends(require_token),
    ) -> FileResponse:
        """Serve one file from the job's artifacts directory.

        Responds 400 when *path* cannot be resolved or points outside the
        artifacts directory, and 404 when the job, its artifacts directory or
        the file does not exist.
        """
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip()
        if not artifacts_dir_raw:
            raise HTTPException(status_code=404, detail="Artifacts not available yet")
        artifacts_dir = Path(artifacts_dir_raw).resolve()
        try:
            # *path* is caller-controlled; resolving can itself fail, e.g.
            # ValueError for an embedded NUL byte.
            requested = Path(path).resolve()
        except (OSError, RuntimeError, ValueError) as exc:
            raise HTTPException(status_code=400, detail="Invalid artifact path") from exc
        try:
            # Containment check on the resolved path, so ".." segments and
            # symlinks cannot escape the artifacts directory.
            requested.relative_to(artifacts_dir)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Artifact path is outside job artifacts directory") from exc
        if not requested.exists() or not requested.is_file():
            raise HTTPException(status_code=404, detail="Artifact not found")
        return FileResponse(str(requested))

    @app.get("/api/stats")
    def stats(_: None = Depends(require_token)) -> dict[str, Any]:
        """Return the manager's statistics."""
        return manager.stats()

    @app.get("/api/analytics")
    def analytics(_: None = Depends(require_token)) -> dict[str, Any]:
        """Return the manager's analytics with a ``generated_at`` timestamp added."""
        payload = manager.analytics()
        payload["generated_at"] = utc_now_iso()
        return payload

    if not app_config.disable_ui:

        @app.get("/", response_class=HTMLResponse)
        def ui_root() -> str:
            """Serve the monitoring page."""
            return monitoring_page_html(device_hostname=device_hostname)

        @app.get("/ui/monitoring.js")
        def ui_monitoring_js() -> FileResponse:
            """Serve the monitoring page's JavaScript."""
            return FileResponse(str(monitoring_js_path()), media_type="application/javascript")

        @app.websocket("/ws")
        async def ws_endpoint(websocket: WebSocket, token: str = Query(default="")) -> None:
            """Stream broadcast messages to a client that presents a valid ``token``."""
            if not _token_is_valid(token):
                await websocket.close(code=1008)
                return
            await ws_hub.connect(websocket)
            try:
                await websocket.send_json({"event_type": "connected", "payload": {"ok": True}})
                # Incoming messages are ignored; reading only serves to
                # notice when the client goes away.
                while True:
                    await websocket.receive_text()
            except WebSocketDisconnect:
                pass  # Normal client disconnect.
            except Exception:  # noqa: BLE001
                pass  # Any other failure also just ends this session.
            finally:
                # Runs on cancellation too, so the hub never keeps a socket
                # whose handler has already exited.
                await ws_hub.disconnect(websocket)

    else:

        @app.get("/", response_class=JSONResponse)
        def ui_disabled() -> dict[str, Any]:
            """Report that the UI is disabled."""
            return {"ok": True, "ui_disabled": True}

    return app
def main() -> None:
    """Load settings from the current working directory and serve the app with uvicorn."""
    import uvicorn

    settings = load_app_config(Path.cwd())
    application = create_app(settings)
    # Read host/port back from the app so they match what the app itself uses.
    active = application.state.config
    uvicorn.run(application, host=active.host, port=active.port, log_level="info")