452 lines
17 KiB
Python
452 lines
17 KiB
Python
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import secrets
|
|
import socket
|
|
from contextlib import asynccontextmanager
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from fastapi import Depends, FastAPI, Header, HTTPException, Query, WebSocket, WebSocketDisconnect
|
|
from fastapi.responses import FileResponse
|
|
from fastapi.responses import HTMLResponse, JSONResponse
|
|
from pydantic import BaseModel, Field
|
|
|
|
from .agent import normalize_disabled_tools
|
|
from .config import AppConfig, load_app_config
|
|
from .storage import HistoryDB
|
|
from .task_manager import JobManager
|
|
from .ui import monitoring_js_path, monitoring_page_html
|
|
from .utils import utc_now_iso
|
|
|
|
|
|
class CreateJobRequest(BaseModel):
    """Validated request body for ``POST /api/jobs``.

    Only ``job`` is required. Every other field has a default, and the
    numeric fields carry inclusive lower/upper bounds enforced by pydantic.
    """

    # Required, non-empty objective text.
    job: str = Field(..., min_length=1)
    # Optional model override; ``None`` is passed through as-is.
    model: str | None = Field(default=None)

    # Integer budgets (inclusive bounds).
    max_steps: int = Field(default=60, ge=1, le=400)
    command_timeout: int = Field(default=45, ge=1, le=600)

    # Input pacing values (inclusive bounds).
    type_interval: float = Field(default=0.02, ge=0.0, le=1.0)
    click_pause: float = Field(default=0.10, ge=0.0, le=2.0)

    # Must be exactly one of: low, medium, high.
    reasoning_effort: str = Field(default="medium", pattern="^(low|medium|high)$")

    screen_context_decay_steps: int = Field(default=4, ge=0, le=50)
    max_visual_context_images: int = Field(default=3, ge=0, le=12)

    # Must be exactly one of: off, prefer, require_fallback.
    native_automation_mode: str = Field(default="prefer", pattern="^(off|prefer|require_fallback)$")

    # Timeouts share the same 0.5 .. 120.0 range.
    dialog_timeout_seconds: float = Field(default=12.0, ge=0.5, le=120.0)
    focus_timeout_seconds: float = Field(default=8.0, ge=0.5, le=120.0)
    ui_element_timeout_seconds: float = Field(default=8.0, ge=0.5, le=120.0)

    max_retries_per_surface: int = Field(default=3, ge=1, le=10)

    pretty_logs: bool = Field(default=False)
    # A fresh list per request; normalised by the endpoint before submission.
    disabled_tools: list[str] = Field(default_factory=list)
    safety_override: bool = Field(default=False)
    no_failsafe: bool = Field(default=False)
|
|
|
|
|
|
def _safe_int(value: Any) -> int | None:
    """Convert *value* with ``int()``; return ``None`` if the conversion raises."""
    try:
        converted = int(value)
    except Exception:  # noqa: BLE001 - any conversion failure means "no value"
        return None
    return converted
|
|
|
|
|
|
def _safe_text(value: Any, limit: int = 180) -> str:
    """Return *value* as stripped text, cut to *limit* characters plus ``...``.

    Falsy values (``None``, ``0``, ``""``) become the empty string. Text
    that already fits within *limit* is returned without the ellipsis.
    """
    cleaned = str(value or "").strip()
    return cleaned if len(cleaned) <= limit else f"{cleaned[:limit]}..."
|
|
|
|
|
|
def _resolve_artifact_path(artifacts_dir: Path | None, path_raw: Any) -> Path | None:
    """Resolve *path_raw* and return it only if it lies under *artifacts_dir*.

    Returns ``None`` when there is no artifacts directory, when the raw
    path is empty after stripping, or when the resolved path is not
    located inside *artifacts_dir*. The containment check is done on the
    resolved path, so ``..`` segments cannot escape the directory.
    """
    if artifacts_dir is None:
        return None
    raw = str(path_raw or "").strip()
    if not raw:
        return None
    resolved = Path(raw).resolve()
    return resolved if resolved.is_relative_to(artifacts_dir) else None
|
|
|
|
|
|
def _extract_replay_action(
    event: dict[str, Any],
    pending_tool_args: dict[tuple[int, str], list[dict[str, Any]]],
) -> dict[str, Any] | None:
    """Turn a ``tool_called`` / ``tool_result`` event into a replay overlay action.

    Args:
        event: A stored job event; ``event_type``, ``payload``, ``step``,
            ``ts`` and ``id`` are read from it.
        pending_tool_args: Mutable FIFO queues keyed by ``(step, tool)``.
            A ``tool_called`` event appends its arguments; the matching
            ``tool_result`` pops the oldest entry so the result label can
            show what was requested.

    Returns:
        A dict describing the action, or ``None`` for any other event type.
    """
    kind = str(event.get("event_type") or "")
    raw_payload = event.get("payload")
    payload = raw_payload if isinstance(raw_payload, dict) else {}
    # The common fields are coerced for every event, before the type is inspected.
    step = int(event.get("step") or 0)
    ts = str(event.get("ts") or "")
    event_id = int(event.get("id") or 0)

    if kind not in ("tool_called", "tool_result"):
        return None

    tool = str(payload.get("tool") or "").strip()

    if kind == "tool_called":
        raw_args = payload.get("args")
        args = raw_args if isinstance(raw_args, dict) else {}
        if tool:
            # Remember the arguments so the later result can be labelled with them.
            pending_tool_args.setdefault((step, tool), []).append(args)

        action: dict[str, Any] = {
            "ts": ts,
            "step": step,
            "event_id": event_id,
            "kind": "tool_called",
            "tool": tool,
            "label": f"Call: {tool}" if tool else "Tool call",
        }

        if tool == "click":
            coord = args.get("coordinate")
            if isinstance(coord, dict):
                x = _safe_int(coord.get("x"))
                y = _safe_int(coord.get("y"))
                if x is not None and y is not None:
                    action["requested_click"] = {"x": x, "y": y}
                    action["label"] = f"Call: click ({x}, {y})"
        elif tool == "type":
            preview = _safe_text(args.get("text"), 120)
            if preview:
                action["text_preview"] = preview
                action["label"] = f"Call: type \"{preview}\""
        return action

    # kind == "tool_result"
    raw_result = payload.get("result")
    result = raw_result if isinstance(raw_result, dict) else {}

    queue_key = (step, tool)
    queue = pending_tool_args.get(queue_key) or []
    requested: dict[str, Any] = {}
    if queue:
        requested = queue.pop(0)
        if not queue:
            # Drop exhausted queues so the mapping does not accumulate empty lists.
            pending_tool_args.pop(queue_key, None)

    succeeded = bool(result.get("ok"))
    action = {
        "ts": ts,
        "step": step,
        "event_id": event_id,
        "kind": "tool_result",
        "tool": tool,
        "ok": succeeded,
        "label": f"Result: {tool}",
    }

    if tool == "click":
        raw_clicked = result.get("clicked")
        clicked = raw_clicked if isinstance(raw_clicked, dict) else {}
        x = _safe_int(clicked.get("x"))
        y = _safe_int(clicked.get("y"))
        if x is not None and y is not None:
            action["click"] = {"x": x, "y": y}
            if succeeded:
                action["label"] = f"Clicked ({x}, {y})"
            else:
                action["label"] = f"Click failed ({x}, {y})"
    elif tool == "type":
        preview = _safe_text((requested or {}).get("text"), 120)
        typed_length = _safe_int(result.get("typed_length"))
        if typed_length is not None:
            action["typed_length"] = typed_length
        if preview:
            action["text_preview"] = preview
            action["label"] = f"Typed \"{preview}\""
    elif tool == "press_key":
        key_name = _safe_text(result.get("key"), 80)
        if key_name:
            action["label"] = f"Pressed {key_name}"
    elif tool == "execute_command":
        command = _safe_text((requested or {}).get("command"), 140)
        if command:
            action["command_preview"] = command
            action["label"] = f"Command: {command}"
    return action
|
|
|
|
|
|
def _build_replay_payload(job_id: str, job: dict[str, Any], events: list[dict[str, Any]]) -> dict[str, Any]:
    """Group a job's events into screenshot frames with their overlay actions.

    Each ``visual_update`` event whose image resolves to an existing file
    inside the job's artifacts directory becomes a frame. Tool actions seen
    since the previous frame are attached to it as ``overlays``. Actions
    left over after the last frame are returned as ``trailing_events``.

    Args:
        job_id: Identifier echoed back in the payload.
        job: Job record; only ``artifacts_dir`` is read.
        events: Events in the order they should be replayed.
    """
    raw_dir = str(job.get("artifacts_dir") or "").strip()
    artifacts_dir = Path(raw_dir).resolve() if raw_dir else None

    pending_tool_args: dict[tuple[int, str], list[dict[str, Any]]] = {}
    pending_actions: list[dict[str, Any]] = []
    frames: list[dict[str, Any]] = []

    for event in events:
        action = _extract_replay_action(event, pending_tool_args)
        if action is not None:
            pending_actions.append(action)

        if str(event.get("event_type") or "") != "visual_update":
            continue

        raw_payload = event.get("payload")
        payload = raw_payload if isinstance(raw_payload, dict) else {}
        raw_meta = payload.get("image_meta")
        image_meta = raw_meta if isinstance(raw_meta, dict) else {}

        image_path = _resolve_artifact_path(artifacts_dir, image_meta.get("path"))
        if image_path is None or not (image_path.exists() and image_path.is_file()):
            # No usable image: keep the buffered actions for the next frame.
            continue

        width = _safe_int(image_meta.get("width"))
        height = _safe_int(image_meta.get("height"))
        if width is None or height is None:
            # Fall back to the nested screen_size entry for both dimensions.
            raw_size = image_meta.get("screen_size")
            size = raw_size if isinstance(raw_size, dict) else {}
            width = _safe_int(size.get("width"))
            height = _safe_int(size.get("height"))

        has_positive_size = (
            isinstance(width, int) and isinstance(height, int) and width > 0 and height > 0
        )
        is_fullscreen = (
            str(payload.get("kind") or "") == "see_screen"
            and bool(image_meta.get("grid"))
            and has_positive_size
        )
        screen_size = {"width": width, "height": height} if width and height else None

        frame = {
            "frame_index": len(frames),
            "event_id": int(event.get("id") or 0),
            "ts": str(event.get("ts") or ""),
            "step": int(event.get("step") or 0),
            "kind": str(payload.get("kind") or "visual_update"),
            "image_path": str(image_path),
            "image_meta": image_meta,
            "screen_size": screen_size,
            "is_fullscreen": is_fullscreen,
            "overlays": pending_actions,
        }
        frames.append(frame)
        # Start a fresh buffer; the frame keeps ownership of the old list.
        pending_actions = []

    return {
        "job_id": job_id,
        "total_events": len(events),
        "total_frames": len(frames),
        "frames": frames,
        "trailing_events": pending_actions,
    }
|
|
|
|
|
|
class _WebSocketHub:
    """Registry of connected WebSocket clients with fan-out broadcasting.

    The connection set is guarded by an ``asyncio.Lock``. Code running in
    other threads publishes through :meth:`broadcast_from_thread`, which
    schedules the broadcast on the loop registered via :meth:`set_loop`.
    """

    def __init__(self) -> None:
        self._loop: asyncio.AbstractEventLoop | None = None
        self._lock = asyncio.Lock()
        self._connections: set[WebSocket] = set()

    def set_loop(self, loop: asyncio.AbstractEventLoop) -> None:
        """Record the event loop that cross-thread broadcasts are scheduled on."""
        self._loop = loop

    async def connect(self, websocket: WebSocket) -> None:
        """Accept *websocket* and add it to the registry."""
        await websocket.accept()
        async with self._lock:
            self._connections.add(websocket)

    async def disconnect(self, websocket: WebSocket) -> None:
        """Remove *websocket* from the registry; unknown sockets are ignored."""
        async with self._lock:
            self._connections.discard(websocket)

    async def broadcast(self, message: dict[str, Any]) -> None:
        """Send *message* as JSON to every client, dropping those that fail."""
        # Snapshot under the lock, send outside it, so a slow client does not
        # block connect/disconnect.
        async with self._lock:
            recipients = tuple(self._connections)

        failed: list[WebSocket] = []
        for client in recipients:
            try:
                await client.send_json(message)
            except Exception:  # noqa: BLE001 - a failed send marks the client as dead
                failed.append(client)

        if not failed:
            return
        async with self._lock:
            self._connections.difference_update(failed)

    def broadcast_from_thread(self, message: dict[str, Any]) -> None:
        """Schedule :meth:`broadcast` from a non-loop thread; no-op until a loop is set."""
        loop = self._loop
        if loop is not None:
            asyncio.run_coroutine_threadsafe(self.broadcast(message), loop)
|
|
|
|
|
|
def _tokens_match(presented: str | None, expected: str) -> bool:
    """Return True when *presented* equals *expected*, compared in constant time.

    ``secrets.compare_digest`` raises ``TypeError`` when given ``str``
    arguments containing non-ASCII characters. Both sides are therefore
    encoded to bytes first, so an arbitrary client-supplied token yields a
    plain mismatch rather than an unhandled exception. A missing or empty
    token never matches.
    """
    if not presented:
        return False
    return secrets.compare_digest(
        presented.encode("utf-8", "surrogatepass"),
        expected.encode("utf-8", "surrogatepass"),
    )


def create_app(config: AppConfig | None = None) -> FastAPI:
    """Build the ScreenJob FastAPI application.

    Args:
        config: Application configuration. When omitted (or falsy) it is
            loaded via ``load_app_config`` using the current working directory.

    Returns:
        A FastAPI app with the job, stats and analytics endpoints registered.
        Unless ``config.disable_ui`` is set, the monitoring page, its script
        and the ``/ws`` WebSocket endpoint are registered too. The config,
        database, WebSocket hub and job manager are exposed on ``app.state``.

    Raises:
        RuntimeError: If the OpenAI API key or the ScreenJob token is missing.
    """
    app_config = config or load_app_config(cwd=Path.cwd())
    if not app_config.openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is required in environment or .env.")
    if not app_config.screenjob_token:
        raise RuntimeError("SCREENJOB_TOKEN is required in environment or .env.")

    db = HistoryDB(app_config.db_path)
    ws_hub = _WebSocketHub()
    manager = JobManager(config=app_config, db=db, broadcast=ws_hub.broadcast_from_thread)

    @asynccontextmanager
    async def lifespan(_: FastAPI):
        # Capture the serving loop so the manager can broadcast from other threads.
        ws_hub.set_loop(asyncio.get_running_loop())
        yield

    app = FastAPI(title="ScreenJob API", version="1.0.0", lifespan=lifespan)

    app.state.config = app_config
    app.state.db = db
    app.state.ws_hub = ws_hub
    app.state.manager = manager
    device_hostname = socket.gethostname()

    def _extract_token(
        authorization: str | None,
        x_screenjob_token: str | None,
        query_token: str | None,
    ) -> str:
        # Precedence: ?token= query parameter, then X-Screenjob-Token header,
        # then Authorization header (with or without a "Bearer " prefix).
        if query_token:
            return query_token.strip()
        if x_screenjob_token:
            return x_screenjob_token.strip()
        if authorization:
            token = authorization.strip()
            if token.lower().startswith("bearer "):
                return token[7:].strip()
            return token
        return ""

    def require_token(
        authorization: str | None = Header(default=None),
        x_screenjob_token: str | None = Header(default=None),
        token: str | None = Query(default=None),
    ) -> None:
        # Dependency guarding every /api route: 401 unless the token matches.
        resolved = _extract_token(authorization, x_screenjob_token, token)
        if not _tokens_match(resolved, app_config.screenjob_token):
            raise HTTPException(status_code=401, detail="Unauthorized")

    # Submit a new job; a ValueError during normalisation or submission becomes a 400.
    @app.post("/api/jobs")
    def create_job(payload: CreateJobRequest, _: None = Depends(require_token)) -> dict[str, str]:
        try:
            disabled_tools = normalize_disabled_tools(payload.disabled_tools)
            job_id = manager.submit_job(
                objective=payload.job,
                model=payload.model,
                max_steps=payload.max_steps,
                command_timeout=payload.command_timeout,
                type_interval=payload.type_interval,
                click_pause=payload.click_pause,
                reasoning_effort=payload.reasoning_effort,
                screen_context_decay_steps=payload.screen_context_decay_steps,
                max_visual_context_images=payload.max_visual_context_images,
                native_automation_mode=payload.native_automation_mode,
                dialog_timeout_seconds=payload.dialog_timeout_seconds,
                focus_timeout_seconds=payload.focus_timeout_seconds,
                ui_element_timeout_seconds=payload.ui_element_timeout_seconds,
                max_retries_per_surface=payload.max_retries_per_surface,
                pretty_logs=payload.pretty_logs,
                disabled_tools=disabled_tools,
                safety_override=payload.safety_override,
                no_failsafe=payload.no_failsafe,
            )
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc
        return {"job_id": job_id}

    @app.get("/api/jobs")
    def list_jobs(limit: int = Query(default=100, ge=1, le=500), _: None = Depends(require_token)) -> dict[str, Any]:
        return {"jobs": manager.list_jobs(limit=limit)}

    @app.get("/api/jobs/{job_id}")
    def get_job(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

    # Currently returns the same record as GET /api/jobs/{job_id}.
    @app.get("/api/jobs/{job_id}/status")
    def get_job_status(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return job

    @app.get("/api/jobs/{job_id}/events")
    def get_job_events(
        job_id: str,
        limit: int = Query(default=500, ge=1, le=5000),
        _: None = Depends(require_token),
    ) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        return {"events": manager.get_events(job_id, limit=limit)}

    @app.get("/api/jobs/{job_id}/replay")
    def get_job_replay(
        job_id: str,
        limit: int = Query(default=5000, ge=1, le=5000),
        _: None = Depends(require_token),
    ) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        events = manager.get_events(job_id, limit=limit)
        return _build_replay_payload(job_id, job, events)

    @app.post("/api/jobs/{job_id}/cancel")
    def cancel_job(job_id: str, _: None = Depends(require_token)) -> dict[str, Any]:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        accepted = manager.cancel_job(job_id)
        return {"job_id": job_id, "cancel_requested": bool(accepted)}

    # Serve a file from the job's artifacts directory; anything outside it is rejected.
    @app.get("/api/jobs/{job_id}/artifact")
    def get_job_artifact(
        job_id: str,
        path: str = Query(..., min_length=1),
        _: None = Depends(require_token),
    ) -> FileResponse:
        job = manager.get_job(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="Job not found")
        artifacts_dir_raw = str(job.get("artifacts_dir") or "").strip()
        if not artifacts_dir_raw:
            raise HTTPException(status_code=404, detail="Artifacts not available yet")
        artifacts_dir = Path(artifacts_dir_raw).resolve()
        try:
            # resolve() is inside the try so that a ValueError it may raise for a
            # malformed path (e.g. an embedded NUL) is also reported as a 400.
            requested = Path(path).resolve()
            requested.relative_to(artifacts_dir)
        except ValueError as exc:
            raise HTTPException(status_code=400, detail="Artifact path is outside job artifacts directory") from exc
        if not requested.exists() or not requested.is_file():
            raise HTTPException(status_code=404, detail="Artifact not found")
        return FileResponse(str(requested))

    @app.get("/api/stats")
    def stats(_: None = Depends(require_token)) -> dict[str, Any]:
        return manager.stats()

    @app.get("/api/analytics")
    def analytics(_: None = Depends(require_token)) -> dict[str, Any]:
        payload = manager.analytics()
        payload["generated_at"] = utc_now_iso()
        return payload

    if not app_config.disable_ui:

        @app.get("/", response_class=HTMLResponse)
        def ui_root() -> str:
            return monitoring_page_html(device_hostname=device_hostname)

        @app.get("/ui/monitoring.js")
        def ui_monitoring_js() -> FileResponse:
            return FileResponse(str(monitoring_js_path()), media_type="application/javascript")

        @app.websocket("/ws")
        async def ws_endpoint(websocket: WebSocket, token: str = Query(default="")) -> None:
            # The WebSocket token comes from the query string only and is not stripped.
            if not _tokens_match(token, app_config.screenjob_token):
                await websocket.close(code=1008)
                return
            await ws_hub.connect(websocket)
            try:
                await websocket.send_json({"event_type": "connected", "payload": {"ok": True}})
                # Incoming messages are read and discarded; the loop exists to
                # notice when the client goes away.
                while True:
                    await websocket.receive_text()
            except WebSocketDisconnect:
                pass  # normal client departure
            except Exception:  # noqa: BLE001 - best-effort socket; any failure just ends the session
                pass
            finally:
                # Always de-register, including when the handler task is cancelled.
                await ws_hub.disconnect(websocket)

    else:

        @app.get("/", response_class=JSONResponse)
        def ui_disabled() -> dict[str, Any]:
            return {"ok": True, "ui_disabled": True}

    return app
|
|
|
|
|
|
def main() -> None:
    """Load configuration from the working directory and serve the app with uvicorn."""
    # Imported here so uvicorn is only needed when the server is actually launched.
    import uvicorn

    application = create_app(load_app_config(Path.cwd()))
    settings = application.state.config
    uvicorn.run(
        application,
        host=settings.host,
        port=settings.port,
        log_level="info",
    )
|