# Python source: 480 lines, 19 KiB.
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from fastapi.testclient import TestClient
|
|
|
|
import src.server as server_module
|
|
from src.config import AppConfig
|
|
|
|
|
|
# Job statuses that count as "finished". FakeJobManager.analytics() only
# aggregates outcome, step and cost figures for jobs in one of these states.
_TERMINAL_STATUSES = {"completed", "failed", "cancelled"}
|
|
|
|
|
|
def _objective_category(objective: str) -> str:
    """Map a free-text objective onto a coarse analytics category label.

    The objective is lower-cased and scanned for keyword *substrings*. Rule
    groups are tried in the order listed below and the first group with any
    hit decides the label, so earlier groups take priority. Objectives that
    match no keyword are labelled ``"Other"``.
    """
    # Order matters: an objective mentioning both "website" and "shell" is
    # classified by the first matching group.
    rules = (
        (
            "Browser / web",
            ("browser", "website", "amazon", "google", "login", "shopping", "checkout", "orders"),
        ),
        (
            "Files / terminal",
            ("file", "folder", "directory", "terminal", "shell", "command", "cli", "script", "git", "repo", "install", "pip", "npm"),
        ),
        (
            "Writing / docs",
            ("write", "summary", "document", "docs", "report", "email", "message", "readme", "markdown"),
        ),
        (
            "Data / analysis",
            ("data", "analysis", "csv", "spreadsheet", "sheet", "table", "chart", "dashboard", "metric", "sql"),
        ),
        (
            "Development / ops",
            ("code", "bug", "fix", "test", "debug", "api", "backend", "frontend", "database", "deploy", "docker", "service", "build"),
        ),
    )

    lowered = objective.lower()
    for label, keywords in rules:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "Other"
|
|
|
|
|
|
class FakeJobManager:
    """In-memory stand-in for ``src.server.JobManager`` used by these tests.

    Jobs and events are kept in plain dicts keyed by job id. ``submit_job``
    never executes anything: it records the arguments it was given, writes a
    placeholder screenshot under ``config.runs_dir`` and stores a canned
    "running" job together with five canned events. Tests mutate ``_jobs``
    and ``_events`` directly to simulate other states.
    """

    def __init__(self, *, config: AppConfig, db: Any, broadcast: Any = None) -> None:
        """Keep ``config`` and start with empty job and event tables.

        ``db`` and ``broadcast`` are accepted and ignored.
        NOTE(review): presumably they mirror the real ``JobManager``
        constructor -- confirm against ``src.server``.
        """
        self.config = config
        self._jobs: dict[str, dict[str, Any]] = {}
        self._events: dict[str, list[dict[str, Any]]] = {}
        self._counter = 0
        # Arguments of the most recent submit_job() call, for assertions.
        self.last_submit_payload: dict[str, Any] | None = None

    def submit_job(
        self,
        *,
        objective: str,
        model: str | None = None,
        max_steps: int = 60,
        command_timeout: int = 45,
        type_interval: float = 0.02,
        click_pause: float = 0.10,
        reasoning_effort: str = "medium",
        screen_context_decay_steps: int = 4,
        disabled_tools: list[str] | None = None,
        safety_override: bool = False,
        no_failsafe: bool = False,
    ) -> str:
        """Register a canned "running" job and return its id.

        Side effects: creates ``<runs_dir>/run_<job_id>/`` with a placeholder
        ``screen_step_001.png`` and overwrites ``last_submit_payload``.
        A falsy ``model`` falls back to ``config.default_model``; the chosen
        model name is stripped of surrounding whitespace.

        Returns:
            The new job id, ``job_fake_NNN`` with a per-manager counter.
        """
        self._counter += 1
        job_id = f"job_fake_{self._counter:03d}"
        selected_model = (model or self.config.default_model).strip()

        artifacts_dir = (self.config.runs_dir / f"run_{job_id}").resolve()
        artifacts_dir.mkdir(parents=True, exist_ok=True)
        screenshot_path = artifacts_dir / "screen_step_001.png"
        screenshot_path.write_bytes(b"not-a-real-png")

        # One synthetic second per job. Roll over into minutes so the
        # timestamp stays well-formed from the 60th job onward; the output is
        # unchanged for the first 59 jobs.
        minutes, seconds = divmod(self._counter, 60)
        created_at = f"2026-05-27T00:{minutes:02d}:{seconds:02d}Z"

        self.last_submit_payload = {
            "objective": objective,
            "model": selected_model,
            "disabled_tools": disabled_tools or [],
            "safety_override": safety_override,
            "max_steps": max_steps,
            "command_timeout": command_timeout,
            "type_interval": type_interval,
            "click_pause": click_pause,
            "reasoning_effort": reasoning_effort,
            "screen_context_decay_steps": screen_context_decay_steps,
            "no_failsafe": no_failsafe,
        }

        self._jobs[job_id] = {
            "job_id": job_id,
            "objective": objective,
            "model": selected_model,
            "status": "running",
            "created_at": created_at,
            "started_at": created_at,
            "ended_at": None,
            "steps": 1,
            "result": "Running",
            "response": {"return": "Running", "data": None},
            "return": "Running",
            "data": None,
            "usage": {
                "input_tokens": 10,
                "cached_input_tokens": 2,
                "output_tokens": 4,
                "reasoning_tokens": 0,
                "total_tokens": 14,
                "estimated_cost_usd": 0.0001,
            },
            "artifacts_dir": str(artifacts_dir),
        }

        # Canned history: a click, a typed string, then one screenshot that
        # points at the placeholder file written above.
        self._events[job_id] = [
            {
                "id": 1,
                "job_id": job_id,
                "ts": "2026-05-27T00:00:00Z",
                "step": 1,
                "event_type": "tool_called",
                "payload": {"tool": "click", "args": {"coordinate": {"x": 320, "y": 180}}},
            },
            {
                "id": 2,
                "job_id": job_id,
                "ts": "2026-05-27T00:00:01Z",
                "step": 1,
                "event_type": "tool_result",
                "payload": {"tool": "click", "result": {"ok": True, "clicked": {"x": 322, "y": 182}}},
            },
            {
                "id": 3,
                "job_id": job_id,
                "ts": "2026-05-27T00:00:02Z",
                "step": 1,
                "event_type": "tool_called",
                "payload": {"tool": "type", "args": {"text": "hello world"}},
            },
            {
                "id": 4,
                "job_id": job_id,
                "ts": "2026-05-27T00:00:03Z",
                "step": 1,
                "event_type": "tool_result",
                "payload": {"tool": "type", "result": {"ok": True, "typed_length": 11}},
            },
            {
                "id": 5,
                "job_id": job_id,
                "ts": "2026-05-27T00:00:04Z",
                "step": 1,
                "event_type": "visual_update",
                "payload": {
                    "kind": "see_screen",
                    "image_meta": {
                        "path": str(screenshot_path),
                        "width": 1920,
                        "height": 1080,
                        "grid": True,
                    },
                },
            },
        ]
        return job_id

    def list_jobs(self, limit: int = 100) -> list[dict[str, Any]]:
        """Return at most ``limit`` job records in submission order."""
        return list(self._jobs.values())[:limit]

    def get_job(self, job_id: str) -> dict[str, Any] | None:
        """Return the stored record for ``job_id``, or ``None`` if unknown."""
        return self._jobs.get(job_id)

    def get_events(self, job_id: str, limit: int = 500) -> list[dict[str, Any]]:
        """Return the first ``limit`` events of ``job_id`` (empty if unknown)."""
        return self._events.get(job_id, [])[:limit]

    def cancel_job(self, job_id: str) -> bool:
        """Mark a known job as ``"cancelling"``.

        Returns:
            ``True`` when the job exists (its status is overwritten),
            ``False`` for an unknown id.
        """
        if job_id not in self._jobs:
            return False
        self._jobs[job_id]["status"] = "cancelling"
        return True

    def stats(self) -> dict[str, Any]:
        """Summarise job counts per status plus the total estimated cost.

        The completed/failed/cancelled counters are derived from the stored
        statuses so they stay consistent with ``analytics()`` when a test
        moves a job into a terminal state.
        """
        statuses = [job["status"] for job in self._jobs.values()]
        return {
            "total_jobs": len(self._jobs),
            "running_jobs": statuses.count("running"),
            "completed_jobs": statuses.count("completed"),
            "failed_jobs": statuses.count("failed"),
            "cancelled_jobs": statuses.count("cancelled"),
            "total_estimated_cost": sum(
                float((job["usage"] or {}).get("estimated_cost_usd") or 0)
                for job in self._jobs.values()
            ),
            "live_running_threads": 0,
        }

    def analytics(self) -> dict[str, Any]:
        """Aggregate jobs overall, per objective category and per creation day.

        Every job counts towards ``total_jobs``. Outcome counters, step
        averages and cost averages only include jobs whose status is in
        ``_TERMINAL_STATUSES``. ``by_category`` is ordered by descending
        success rate, then label; ``timeline`` is ordered by day label
        (the first 10 characters of ``created_at``, or ``"unknown"``).
        """

        def new_bucket(label: str) -> dict[str, Any]:
            # One accumulator shape shared by the overall, per-category and
            # per-day aggregates.
            return {
                "label": label,
                "total_jobs": 0,
                "finished_jobs": 0,
                "completed_jobs": 0,
                "failed_jobs": 0,
                "cancelled_jobs": 0,
                "steps_sum": 0,
                "steps_count": 0,
                "cost_sum": 0.0,
                "cost_count": 0,
            }

        def bucket(target: dict[str, dict[str, Any]], key: str) -> dict[str, Any]:
            if key not in target:
                target[key] = new_bucket(key)
            return target[key]

        def finalize(item: dict[str, Any]) -> dict[str, Any]:
            # Turn raw sums into the rates and averages exposed to clients.
            finished = item["finished_jobs"]
            return {
                "label": item["label"],
                "total_jobs": item["total_jobs"],
                "finished_jobs": finished,
                "completed_jobs": item["completed_jobs"],
                "failed_jobs": item["failed_jobs"],
                "cancelled_jobs": item["cancelled_jobs"],
                "success_rate": round((item["completed_jobs"] / finished) * 100, 2) if finished else 0.0,
                "avg_steps": round(item["steps_sum"] / item["steps_count"], 2) if item["steps_count"] else None,
                "avg_cost_usd": round(item["cost_sum"] / item["cost_count"], 6) if item["cost_count"] else None,
            }

        overall = new_bucket("overall")
        by_category: dict[str, dict[str, Any]] = {}
        by_day: dict[str, dict[str, Any]] = {}

        for job in self._jobs.values():
            status = str(job.get("status") or "")
            category = _objective_category(str(job.get("objective") or ""))
            day = str(job.get("created_at") or "")[:10] or "unknown"
            buckets = (overall, bucket(by_category, category), bucket(by_day, day))

            for item in buckets:
                item["total_jobs"] += 1

            # Unfinished jobs contribute to totals only.
            if status not in _TERMINAL_STATUSES:
                continue

            steps_raw = job.get("steps")
            steps = int(steps_raw) if steps_raw is not None else None
            cost_raw = (job.get("usage") or {}).get("estimated_cost_usd")
            cost = float(cost_raw) if cost_raw is not None else None

            for item in buckets:
                item["finished_jobs"] += 1
                if status == "completed":
                    item["completed_jobs"] += 1
                elif status == "failed":
                    item["failed_jobs"] += 1
                elif status == "cancelled":
                    item["cancelled_jobs"] += 1
                if steps is not None:
                    item["steps_sum"] += steps
                    item["steps_count"] += 1
                if cost is not None:
                    item["cost_sum"] += cost
                    item["cost_count"] += 1

        # The top-level summary has the same fields as a bucket, minus the label.
        summary = finalize(overall)
        del summary["label"]
        summary["by_category"] = sorted(
            (finalize(item) for item in by_category.values()),
            key=lambda row: (-row["success_rate"], row["label"]),
        )
        summary["timeline"] = sorted(
            (finalize(item) for item in by_day.values()),
            key=lambda row: row["label"],
        )
        return summary
|
|
|
|
|
|
def _build_app(tmp_path: Path, monkeypatch: Any, disable_ui: bool = False):
    """Create the server app wired to ``FakeJobManager`` and a temp config.

    ``src.server.JobManager`` is monkeypatched to ``FakeJobManager`` before
    the app is created. The runs directory and the database path both live
    under ``tmp_path``.

    Returns:
        ``(app, config)`` -- the created application and the ``AppConfig``
        it was built from.
    """
    monkeypatch.setattr(server_module, "JobManager", FakeJobManager)

    settings = {
        "openai_api_key": "test_key",
        "screenjob_token": "test_token",
        "disable_ui": disable_ui,
        "default_model": "gpt-5.4-mini",
        "safety_model": "gpt-5.4-mini",
        "host": "127.0.0.1",
        "port": 8787,
        "runs_dir": tmp_path / "runs",
        "db_path": tmp_path / "screenjob_test.db",
    }
    config = AppConfig(**settings)

    # The runs directory must exist before the app is created.
    config.runs_dir.mkdir(parents=True, exist_ok=True)
    return server_module.create_app(config), config
|
|
|
|
|
|
def test_api_requires_auth(tmp_path: Path, monkeypatch: Any) -> None:
    """Listing and creating jobs without a bearer token both yield 401."""
    application, _config = _build_app(tmp_path, monkeypatch, disable_ui=False)
    anonymous = TestClient(application)

    listing = anonymous.get("/api/jobs")
    assert listing.status_code == 401

    creation = anonymous.post("/api/jobs", json={"job": "x"})
    assert creation.status_code == 401
|
|
|
|
|
|
def test_create_job_returns_only_job_id_and_defaults_model(tmp_path: Path, monkeypatch: Any) -> None:
    """POST /api/jobs returns just ``job_id`` and fills in default settings.

    Also checks that the status endpoint exposes the canned "Running"
    response for the freshly created job.
    """
    application, _config = _build_app(tmp_path, monkeypatch, disable_ui=False)
    api = TestClient(application)
    auth = {"Authorization": "Bearer test_token"}
    request_body = {"job": "Open amazon.de", "disabled_tools": ["click"], "safety_override": True}

    created = api.post("/api/jobs", headers=auth, json=request_body)
    assert created.status_code == 200
    created_body = created.json()
    assert list(created_body.keys()) == ["job_id"]
    job_id = created_body["job_id"]

    # What the server actually forwarded to the (fake) job manager.
    submitted = application.state.manager.last_submit_payload
    assert submitted["model"] == "gpt-5.4-mini"
    assert submitted["disabled_tools"] == ["click"]
    assert submitted["reasoning_effort"] == "medium"
    assert submitted["screen_context_decay_steps"] == 4

    status_res = api.get(f"/api/jobs/{job_id}/status", headers=auth)
    assert status_res.status_code == 200
    assert status_res.json()["job_id"] == job_id
    assert status_res.json()["response"]["return"] == "Running"
    assert "data" in status_res.json()["response"]
|
|
|
|
|
|
def test_cancel_endpoint_and_events(tmp_path: Path, monkeypatch: Any) -> None:
    """Events are listable and cancelling moves the job to ``cancelling``.

    After the cancel request the job still reports the canned ``"Running"``
    return value and no data.
    """
    application, _config = _build_app(tmp_path, monkeypatch, disable_ui=False)
    api = TestClient(application)
    auth = {"Authorization": "Bearer test_token"}

    creation = api.post("/api/jobs", headers=auth, json={"job": "Test job"})
    job_id = creation.json()["job_id"]
    job_url = f"/api/jobs/{job_id}"

    event_listing = api.get(f"{job_url}/events?limit=20", headers=auth)
    assert event_listing.status_code == 200
    assert len(event_listing.json()["events"]) >= 1

    cancellation = api.post(f"{job_url}/cancel", headers=auth)
    assert cancellation.status_code == 200
    assert cancellation.json()["cancel_requested"] is True

    job_after_cancel = api.get(job_url, headers=auth).json()
    assert job_after_cancel["status"] == "cancelling"
    assert job_after_cancel["return"] == "Running"
    assert job_after_cancel["data"] is None
|
|
|
|
|
|
def test_replay_endpoint_builds_frames_and_overlays(tmp_path: Path, monkeypatch: Any) -> None:
    """The replay of a fresh fake job has one full-screen frame with overlays.

    The frame's overlay labels must mention both the click and the typed text
    from the canned tool events.
    """
    application, _config = _build_app(tmp_path, monkeypatch, disable_ui=False)
    api = TestClient(application)
    auth = {"Authorization": "Bearer test_token"}

    creation = api.post("/api/jobs", headers=auth, json={"job": "Replay test"})
    job_id = creation.json()["job_id"]

    replay_res = api.get(f"/api/jobs/{job_id}/replay?limit=200", headers=auth)
    assert replay_res.status_code == 200

    replay = replay_res.json()
    assert replay["job_id"] == job_id
    assert replay["total_frames"] == 1

    first_frame = replay["frames"][0]
    assert first_frame["kind"] == "see_screen"
    assert first_frame["is_fullscreen"] is True

    overlay_labels = [overlay.get("label", "") for overlay in first_frame["overlays"]]

    def mentioned(word: str) -> bool:
        # Case-insensitive search across all overlay labels.
        return any(word in label.lower() for label in overlay_labels)

    assert mentioned("click")
    assert mentioned("typed")
|
|
|
|
|
|
def test_replay_endpoint_skips_visual_paths_outside_artifacts(tmp_path: Path, monkeypatch: Any) -> None:
    """A screenshot path outside the job's artifacts dir yields no frame.

    An extra ``visual_update`` event is appended whose image lives directly
    in ``tmp_path``, i.e. outside ``<tmp_path>/runs/run_<job_id>/``. The
    replay must still contain only the single legitimate frame.
    """
    app, _ = _build_app(tmp_path, monkeypatch, disable_ui=False)
    manager = app.state.manager
    client = TestClient(app)
    headers = {"Authorization": "Bearer test_token"}
    create = client.post("/api/jobs", headers=headers, json={"job": "Replay path check"})
    job_id = create.json()["job_id"]

    # Create the file for real: otherwise the frame could be dropped merely
    # because the image does not exist, and the test would pass without the
    # containment check ever being exercised.
    outside_image = (tmp_path / "outside.png").resolve()
    outside_image.write_bytes(b"not-a-real-png")

    manager._events[job_id].append(
        {
            "id": 999,
            "job_id": job_id,
            "ts": "2026-05-27T00:01:00Z",
            "step": 2,
            "event_type": "visual_update",
            "payload": {
                "kind": "see_screen",
                "image_meta": {
                    "path": str(outside_image),
                    "width": 100,
                    "height": 100,
                    "grid": True,
                },
            },
        }
    )

    replay = client.get(f"/api/jobs/{job_id}/replay?limit=500", headers=headers)
    assert replay.status_code == 200
    payload = replay.json()
    assert payload["total_frames"] == 1
|
|
|
|
|
|
def test_analytics_endpoint_groups_by_category_and_time(tmp_path: Path, monkeypatch: Any) -> None:
    """GET /api/analytics aggregates overall, per category and per day.

    Three jobs are submitted and then rewritten in place as finished jobs:
    two browser jobs (one completed, one failed) and one completed terminal
    job, spread over two creation days.
    """
    application, _config = _build_app(tmp_path, monkeypatch, disable_ui=False)
    fake_manager = application.state.manager
    api = TestClient(application)
    auth = {"Authorization": "Bearer test_token"}

    def submit(objective: str) -> str:
        return api.post("/api/jobs", headers=auth, json={"job": objective}).json()["job_id"]

    def finish(job_id: str, *, status: str, ended_at: str, steps: int, created_at: str, cost: float) -> None:
        # Overwrite the canned "running" record with a finished outcome.
        record = fake_manager._jobs[job_id]
        record.update(
            status=status,
            ended_at=ended_at,
            steps=steps,
            created_at=created_at,
            usage={**record["usage"], "estimated_cost_usd": cost},
        )

    browser_completed = submit("Open amazon.de and checkout")
    browser_failed = submit("Open website and login")
    terminal_completed = submit("Run a shell command to inspect files")

    finish(
        browser_completed,
        status="completed",
        ended_at="2026-05-27T00:10:00Z",
        steps=4,
        created_at="2026-05-27T00:00:01Z",
        cost=0.12,
    )
    finish(
        browser_failed,
        status="failed",
        ended_at="2026-05-28T00:10:00Z",
        steps=6,
        created_at="2026-05-28T00:00:01Z",
        cost=0.24,
    )
    finish(
        terminal_completed,
        status="completed",
        ended_at="2026-05-28T00:15:00Z",
        steps=10,
        created_at="2026-05-28T00:00:02Z",
        cost=0.05,
    )

    analytics_res = api.get("/api/analytics", headers=auth)
    assert analytics_res.status_code == 200
    report = analytics_res.json()

    assert report["total_jobs"] == 3
    assert report["finished_jobs"] == 3
    assert report["completed_jobs"] == 2
    assert report["failed_jobs"] == 1
    assert report["success_rate"] == 66.67
    assert report["avg_steps"] == 6.67
    assert report["avg_cost_usd"] == 0.136667

    def category_row(label: str) -> dict[str, Any]:
        return next(row for row in report["by_category"] if row["label"] == label)

    browser_row = category_row("Browser / web")
    terminal_row = category_row("Files / terminal")
    assert browser_row["finished_jobs"] == 2
    assert browser_row["success_rate"] == 50.0
    assert browser_row["avg_steps"] == 5.0
    assert terminal_row["success_rate"] == 100.0

    timeline_days = [row["label"] for row in report["timeline"]]
    assert timeline_days == ["2026-05-27", "2026-05-28"]
|
|
|
|
|
|
def test_ui_toggle(tmp_path: Path, monkeypatch: Any) -> None:
    """``disable_ui`` switches ``/`` between the monitor page and a JSON stub.

    With the UI enabled the root serves the monitor HTML and
    ``/ui/monitoring.js`` is available; with it disabled the root answers
    with JSON containing ``ui_disabled: true``.
    """
    ui_app, _ = _build_app(tmp_path / "enabled", monkeypatch, disable_ui=False)
    ui_client = TestClient(ui_app)

    index_page = ui_client.get("/")
    assert index_page.status_code == 200
    for marker in ("ScreenJob Monitor", "Success by Objective Category"):
        assert marker in index_page.text

    monitoring_script = ui_client.get("/ui/monitoring.js")
    assert monitoring_script.status_code == 200
    assert "const tokenInput" in monitoring_script.text

    headless_app, _ = _build_app(tmp_path / "disabled", monkeypatch, disable_ui=True)
    headless_root = TestClient(headless_app).get("/")
    assert headless_root.status_code == 200
    assert headless_root.json()["ui_disabled"] is True
|