From 8126b57404580f7d1eebf5d6fc51e66ee42fc28f Mon Sep 17 00:00:00 2001 From: Space-Banane Date: Wed, 27 May 2026 22:34:26 +0200 Subject: [PATCH] Add lightweight analytics dashboard Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 6 + src/server.py | 7 ++ src/storage.py | 158 ++++++++++++++++++++++++++ src/task_manager.py | 3 + src/ui_assets/monitoring.html | 24 ++++ src/ui_assets/monitoring.js | 207 ++++++++++++++++++++++++++++++++++ tests/test_server_api.py | 186 ++++++++++++++++++++++++++++++ tests/test_storage.py | 52 +++++++++ todo.md | 2 +- 9 files changed, 644 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 97ad90c..cad5954 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,14 @@ Each job payload includes: - Read-only dashboard (no run controls) - Requires token input - Live updates via `/ws` +- Analytics dashboards for success rate by objective category and daily averages - Set `DISABLE_UI=true` to disable UI +### Analytics API + +- `GET /api/analytics` +- Returns objective-category success rates plus average steps/cost over time + ## Agent Instructions (Practical) - Prefer `execute_command` for deterministic actions (opening URLs, filesystem checks). diff --git a/src/server.py b/src/server.py index d27b9ad..9c01d67 100644 --- a/src/server.py +++ b/src/server.py @@ -16,6 +16,7 @@ from .config import AppConfig, load_app_config from .storage import HistoryDB from .task_manager import JobManager from .ui import monitoring_js_path, monitoring_page_html +from .utils import utc_now_iso class CreateJobRequest(BaseModel): @@ -386,6 +387,12 @@ def create_app(config: AppConfig | None = None) -> FastAPI: def stats(_: None = Depends(require_token)) -> dict[str, Any]: return manager.stats() + @app.get("/api/analytics") + def analytics(_: None = Depends(require_token)) -> dict[str, Any]: + payload = manager.analytics() + payload["generated_at"] = utc_now_iso() + return payload + if not app_config.disable_ui: @app.get("/", response_class=HTMLResponse) def ui_root() -> str: diff --git a/src/storage.py b/src/storage.py index 3abd7eb..0c2669b 100644 --- a/src/storage.py +++ b/src/storage.py @@ -7,6 +7,39 @@ from pathlib import Path from typing import Any +_TERMINAL_STATUSES = {"completed", "failed", "cancelled"} +_CATEGORY_RULES: tuple[tuple[str, tuple[str, ...]], ...] = ( + ( + "Browser / web", + ("browser", "website", "webpage", "chrome", "url", "amazon", "google", "login", "shopping", "checkout", "orders"), + ), + ( + "Files / terminal", + ("file", "folder", "directory", "terminal", "shell", "command", "cli", "script", "git", "repo", "install", "pip", "npm", "powershell", "bash"), + ), + ( + "Writing / docs", + ("write", "summary", "summarize", "document", "docs", "report", "email", "message", "readme", "markdown", "note", "proposal"), + ), + ( + "Data / analysis", + ("data", "analysis", "analyze", "csv", "spreadsheet", "sheet", "table", "chart", "dashboard", "metric", "metrics", "sql"), + ), + ( + "Development / ops", + ("code", "bug", "fix", "test", "debug", "api", "backend", "frontend", "database", "deploy", "docker", "service", "build"), + ), +) + + +def _objective_category(objective: str) -> str: + text = objective.lower() + for category, keywords in _CATEGORY_RULES: + if any(keyword in text for keyword in keywords): + return category + return "Other" + + class HistoryDB: def __init__(self, db_path: Path) -> None: self.db_path = db_path @@ -184,6 +217,131 @@ class HistoryDB: ).fetchone() return dict(totals) if totals else {} + def analytics(self) -> dict[str, Any]: + with self._connect() as conn: + rows = conn.execute( + """ + SELECT job_id, objective, status, steps, estimated_cost_usd, created_at + FROM jobs + ORDER BY created_at ASC, job_id ASC + """ + ).fetchall() + + total_jobs = 0 + finished_jobs = 0 + completed_jobs = 0 + failed_jobs = 0 + cancelled_jobs = 0 + steps_sum = 0 + steps_count = 0 + cost_sum = 0.0 + cost_count = 0 + by_category: dict[str, dict[str, Any]] = {} + by_day: dict[str, dict[str, Any]] = {} + + def _bucket(target: dict[str, dict[str, Any]], key: str) -> dict[str, Any]: + bucket = target.setdefault( + key, + { + "label": key, + "total_jobs": 0, + "finished_jobs": 0, + "completed_jobs": 0, + "failed_jobs": 0, + "cancelled_jobs": 0, + "steps_sum": 0, + "steps_count": 0, + "cost_sum": 0.0, + "cost_count": 0, + }, + ) + return bucket + + for row in rows: + total_jobs += 1 + status = str(row["status"] or "") + finished = status in _TERMINAL_STATUSES + completed = status == "completed" + objective = str(row["objective"] or "") + category = _objective_category(objective) + created_at = str(row["created_at"] or "") + day = created_at[:10] if len(created_at) >= 10 else created_at or "unknown" + + category_bucket = _bucket(by_category, category) + day_bucket = _bucket(by_day, day) + for bucket in (category_bucket, day_bucket): + bucket["total_jobs"] += 1 + + if not finished: + continue + + finished_jobs += 1 + if completed: + completed_jobs += 1 + elif status == "failed": + failed_jobs += 1 + elif status == "cancelled": + cancelled_jobs += 1 + + steps = row["steps"] + if steps is not None: + step_value = int(steps) + steps_sum += step_value + steps_count += 1 + for bucket in (category_bucket, day_bucket): + bucket["steps_sum"] += step_value + bucket["steps_count"] += 1 + + estimated_cost = row["estimated_cost_usd"] + if estimated_cost is not None: + cost_value = float(estimated_cost) + cost_sum += cost_value + cost_count += 1 + for bucket in (category_bucket, day_bucket): + bucket["cost_sum"] += cost_value + bucket["cost_count"] += 1 + + for bucket in (category_bucket, day_bucket): + bucket["finished_jobs"] += 1 + if completed: + bucket["completed_jobs"] += 1 + elif status == "failed": + bucket["failed_jobs"] += 1 + elif status == "cancelled": + bucket["cancelled_jobs"] += 1 + + def _finalize(bucket: dict[str, Any]) -> dict[str, Any]: + finished = bucket["finished_jobs"] + return { + "label": bucket["label"], + "total_jobs": bucket["total_jobs"], + "finished_jobs": finished, + "completed_jobs": bucket["completed_jobs"], + "failed_jobs": bucket["failed_jobs"], + "cancelled_jobs": bucket["cancelled_jobs"], + "success_rate": round((bucket["completed_jobs"] / finished) * 100, 2) if finished else 0.0, + "avg_steps": round(bucket["steps_sum"] / bucket["steps_count"], 2) if bucket["steps_count"] else None, + "avg_cost_usd": round(bucket["cost_sum"] / bucket["cost_count"], 6) if bucket["cost_count"] else None, + } + + category_rows = [_finalize(bucket) for bucket in by_category.values()] + category_rows.sort(key=lambda item: (-item["success_rate"], item["label"])) + day_rows = [_finalize(bucket) for bucket in by_day.values()] + day_rows.sort(key=lambda item: item["label"]) + + return { + "total_jobs": total_jobs, + "finished_jobs": finished_jobs, + "completed_jobs": completed_jobs, + "failed_jobs": failed_jobs, + "cancelled_jobs": cancelled_jobs, + "success_rate": round((completed_jobs / finished_jobs) * 100, 2) if finished_jobs else 0.0, + "avg_steps": round(steps_sum / steps_count, 2) if steps_count else None, + "avg_cost_usd": round(cost_sum / cost_count, 6) if cost_count else None, + "by_category": category_rows, + "timeline": day_rows, + } + def _row_to_job(self, row: sqlite3.Row) -> dict[str, Any]: disabled_tools: list[str] = [] try: diff --git a/src/task_manager.py b/src/task_manager.py index 0fa3157..7fc5c2e 100644 --- a/src/task_manager.py +++ b/src/task_manager.py @@ -351,6 +351,9 @@ class JobManager: stats["live_running_threads"] = sum(1 for job in self._running.values() if job.thread.is_alive()) return stats + def analytics(self) -> dict[str, Any]: + return self.db.analytics() + def _normalize_job_payload(self, job: dict[str, Any]) -> dict[str, Any]: response = job.get("response") if not isinstance(response, dict): diff --git a/src/ui_assets/monitoring.html b/src/ui_assets/monitoring.html index ae0dec4..8854028 100644 --- a/src/ui_assets/monitoring.html +++ b/src/ui_assets/monitoring.html @@ -21,6 +21,30 @@
+
+
+

Analytics

+
+
+
+
+
+
+

Success by Objective Category

+
+
+
+
+
+
+

Avg Steps / Cost Over Time

+
+
+
+
+
+
+
diff --git a/src/ui_assets/monitoring.js b/src/ui_assets/monitoring.js index 1df1af3..6d514f4 100644 --- a/src/ui_assets/monitoring.js +++ b/src/ui_assets/monitoring.js @@ -17,6 +17,12 @@ const replayPrevBtn = document.getElementById("replayPrevBtn"); const replayNextBtn = document.getElementById("replayNextBtn"); const replaySpeedEl = document.getElementById("replaySpeed"); const replaySeekEl = document.getElementById("replaySeek"); +const analyticsMetaEl = document.getElementById("analyticsMeta"); +const analyticsSummaryEl = document.getElementById("analyticsSummary"); +const analyticsCategorySummaryEl = document.getElementById("analyticsCategorySummary"); +const analyticsCategoriesEl = document.getElementById("analyticsCategories"); +const analyticsTrendSummaryEl = document.getElementById("analyticsTrendSummary"); +const analyticsTrendsEl = document.getElementById("analyticsTrends"); const state = { token: localStorage.getItem("screenjob_token") || "", @@ -35,6 +41,7 @@ const state = { } }; const manuallyClosedSockets = new WeakSet(); +const analyticsRefreshEvents = new Set(["job_finished", "job_failed", "job_rejected"]); tokenInput.value = state.token; function authHeaders() { @@ -66,6 +73,197 @@ function renderStats(stats) { `).join(""); } +function escapeHtml(value) { + return String(value ?? "").replace(/[&<>"']/g, (ch) => ({ + "&": "&", + "<": "<", + ">": ">", + '"': """, + "'": "'" + })[ch]); +} + +function formatNumber(value, digits = 2) { + const num = Number(value); + return Number.isFinite(num) ? num.toFixed(digits) : "—"; +} + +function formatCurrency(value, digits = 6) { + const num = Number(value); + return Number.isFinite(num) ? `$${num.toFixed(digits)}` : "—"; +} + +function formatPercent(value) { + const num = Number(value); + return Number.isFinite(num) ? `${num.toFixed(1)}%` : "—"; +} + +function formatDateLabel(value) { + const dt = new Date(value); + if (Number.isNaN(dt.getTime())) return String(value || "—"); + return dt.toLocaleDateString(undefined, { month: "short", day: "numeric" }); +} + +function renderMetricCard(label, value) { + return ` +
+
${escapeHtml(label)}
+
${escapeHtml(value)}
+
+ `; +} + +function renderLineChart(title, points, options = {}) { + const color = options.color || "#22d3ee"; + const valueLabel = options.valueLabel || ""; + const sourcePoints = Array.isArray(points) + ? points.filter((point) => Number.isFinite(Number(point.value))) + : []; + + if (!sourcePoints.length) { + return ` +
+
+
+
${escapeHtml(title)}
+
No data yet
+
+
+
+ `; + } + + const width = 640; + const height = 220; + const margin = { top: 20, right: 18, bottom: 34, left: 44 }; + const values = sourcePoints.map((point) => Number(point.value)); + const minValue = Math.min(...values); + const maxValue = Math.max(...values); + const span = maxValue - minValue || 1; + const chartWidth = width - margin.left - margin.right; + const chartHeight = height - margin.top - margin.bottom; + const xStep = sourcePoints.length > 1 ? chartWidth / (sourcePoints.length - 1) : 0; + const coords = sourcePoints.map((point, index) => ({ + x: margin.left + (index * xStep), + y: margin.top + ((maxValue - Number(point.value)) / span) * chartHeight, + })); + const linePath = coords.map((point, index) => `${index === 0 ? "M" : "L"} ${point.x} ${point.y}`).join(" "); + const baseline = height - margin.bottom; + const midIndex = Math.floor(sourcePoints.length / 2); + const xLabels = [ + { index: 0, label: sourcePoints[0].label }, + { index: midIndex, label: sourcePoints[midIndex].label }, + { index: sourcePoints.length - 1, label: sourcePoints[sourcePoints.length - 1].label }, + ].filter((item, index, array) => item.label && array.findIndex((candidate) => candidate.index === item.index) === index); + const minLabel = options.formatValue ? options.formatValue(minValue) : formatNumber(minValue, 2); + const maxLabel = options.formatValue ? options.formatValue(maxValue) : formatNumber(maxValue, 2); + const latest = sourcePoints[sourcePoints.length - 1]; + const latestValue = options.formatValue ? options.formatValue(latest.value) : formatNumber(latest.value, 2); + + return ` +
+
+
+
${escapeHtml(title)}
+
${escapeHtml(latestValue)}${valueLabel ? ` ${escapeHtml(valueLabel)}` : ""}
+
+
+
${escapeHtml(sourcePoints.length)} points
+
${escapeHtml(minLabel)} - ${escapeHtml(maxLabel)}
+
+
+ + ${Array.from({ length: 4 }, (_, idx) => { + const y = margin.top + (chartHeight / 3) * idx; + return ``; + }).join("")} + + + ${coords.map((point) => ` + + `).join("")} + ${escapeHtml(maxLabel)} + ${escapeHtml(minLabel)} + ${xLabels.map((item) => ` + ${escapeHtml(formatDateLabel(item.label))} + `).join("")} + +
+ `; +} + +function renderAnalytics(payload) { + const analytics = payload || {}; + const categories = Array.isArray(analytics.by_category) ? analytics.by_category : []; + const timeline = Array.isArray(analytics.timeline) ? analytics.timeline : []; + const finishedCategories = categories.filter((row) => Number(row.finished_jobs || 0) > 0); + + if (analyticsMetaEl) { + analyticsMetaEl.textContent = analytics.generated_at + ? `Updated ${new Date(analytics.generated_at).toLocaleString()}` + : "Historical snapshot"; + } + + analyticsSummaryEl.innerHTML = [ + renderMetricCard("Finished Jobs", analytics.finished_jobs || 0), + renderMetricCard("Success Rate", formatPercent(analytics.success_rate)), + renderMetricCard("Avg Steps", formatNumber(analytics.avg_steps, 1)), + renderMetricCard("Avg Cost", formatCurrency(analytics.avg_cost_usd)), + ].join(""); + + analyticsCategorySummaryEl.textContent = finishedCategories.length + ? `${finishedCategories.length} categories` + : "No finished jobs yet"; + + if (finishedCategories.length) { + analyticsCategoriesEl.innerHTML = finishedCategories.map((row) => { + const successRate = Number(row.success_rate || 0); + const completed = Number(row.completed_jobs || 0); + const finished = Number(row.finished_jobs || 0); + const total = Number(row.total_jobs || 0); + const avgSteps = row.avg_steps == null ? "—" : formatNumber(row.avg_steps, 1); + const avgCost = row.avg_cost_usd == null ? "—" : formatCurrency(row.avg_cost_usd); + return ` +
+
+
+
${escapeHtml(row.label || "Other")}
+
${finished} finished · ${completed} completed · ${total} total
+
+
+
${formatPercent(successRate)}
+
success rate
+
+
+
+
+
+
+
Avg steps: ${escapeHtml(avgSteps)}
+
Avg cost: ${escapeHtml(avgCost)}
+
+
+ `; + }).join(""); + } else { + analyticsCategoriesEl.innerHTML = ` +
+ No finished jobs yet. +
+ `; + } + + analyticsTrendSummaryEl.textContent = timeline.length ? `${timeline.length} days` : "No daily data yet"; + analyticsTrendsEl.innerHTML = [ + renderLineChart("Average steps per day", timeline.map((row) => ({ label: row.label, value: row.avg_steps })), { color: "#38bdf8" }), + renderLineChart("Average cost per day", timeline.map((row) => ({ label: row.label, value: row.avg_cost_usd })), { + color: "#34d399", + valueLabel: "USD", + formatValue: (value) => formatCurrency(value), + }), + ].join(""); +} + function renderJobs() { jobListEl.innerHTML = state.jobs.map((job) => { const active = job.job_id === state.selectedJobId; @@ -310,6 +508,11 @@ async function refreshStats() { renderStats(payload); } +async function refreshAnalytics() { + const payload = await api("/api/analytics"); + renderAnalytics(payload); +} + async function refreshJobDetail() { if (!state.selectedJobId) return; const [job, events, replay] = await Promise.all([ @@ -345,6 +548,9 @@ function connectWs() { } await refreshJobs(); await refreshStats(); + if (analyticsRefreshEvents.has(payload.event_type)) { + await refreshAnalytics(); + } } catch (err) { console.error(err); } @@ -362,6 +568,7 @@ function connectWs() { async function fullRefresh() { await refreshJobs(); await refreshStats(); + await refreshAnalytics(); await refreshJobDetail(); } diff --git a/tests/test_server_api.py b/tests/test_server_api.py index a3b422a..3fe8e45 100644 --- a/tests/test_server_api.py +++ b/tests/test_server_api.py @@ -9,6 +9,24 @@ import src.server as server_module from src.config import AppConfig +_TERMINAL_STATUSES = {"completed", "failed", "cancelled"} + + +def _objective_category(objective: str) -> str: + text = objective.lower() + if any(keyword in text for keyword in ("browser", "website", "amazon", "google", "login", "shopping", "checkout", "orders")): + return "Browser / web" + if any(keyword in text for keyword in ("file", "folder", "directory", "terminal", "shell", "command", "cli", "script", "git", "repo", "install", "pip", "npm")): + return "Files / terminal" + if any(keyword in text for keyword in ("write", "summary", "document", "docs", "report", "email", "message", "readme", "markdown")): + return "Writing / docs" + if any(keyword in text for keyword in ("data", "analysis", "csv", "spreadsheet", "sheet", "table", "chart", "dashboard", "metric", "sql")): + return "Data / analysis" + if any(keyword in text for keyword in ("code", "bug", "fix", "test", "debug", "api", "backend", "frontend", "database", "deploy", "docker", "service", "build")): + return "Development / ops" + return "Other" + + class FakeJobManager: def __init__(self, *, config: AppConfig, db: Any, broadcast: Any = None) -> None: self.config = config @@ -39,6 +57,7 @@ class FakeJobManager: artifacts_dir.mkdir(parents=True, exist_ok=True) screenshot_path = artifacts_dir / "screen_step_001.png" screenshot_path.write_bytes(b"not-a-real-png") + created_at = f"2026-05-27T00:00:{self._counter:02d}Z" self.last_submit_payload = { "objective": objective, "model": selected_model, @@ -57,6 +76,10 @@ class FakeJobManager: "objective": objective, "model": selected_model, "status": "running", + "created_at": created_at, + "started_at": created_at, + "ended_at": None, + "steps": 1, "result": "Running", "response": {"return": "Running", "data": None}, "return": "Running", @@ -149,6 +172,114 @@ class FakeJobManager: "live_running_threads": 0, } + def analytics(self) -> dict[str, Any]: + by_category: dict[str, dict[str, Any]] = {} + by_day: dict[str, dict[str, Any]] = {} + + def bucket(target: dict[str, dict[str, Any]], key: str) -> dict[str, Any]: + return target.setdefault( + key, + { + "label": key, + "total_jobs": 0, + "finished_jobs": 0, + "completed_jobs": 0, + "failed_jobs": 0, + "cancelled_jobs": 0, + "steps_sum": 0, + "steps_count": 0, + "cost_sum": 0.0, + "cost_count": 0, + }, + ) + + total_jobs = 0 + finished_jobs = 0 + completed_jobs = 0 + failed_jobs = 0 + cancelled_jobs = 0 + steps_sum = 0 + steps_count = 0 + cost_sum = 0.0 + cost_count = 0 + + for job in self._jobs.values(): + total_jobs += 1 + status = str(job.get("status") or "") + finished = status in _TERMINAL_STATUSES + category = _objective_category(str(job.get("objective") or "")) + day = str(job.get("created_at") or "")[:10] or "unknown" + + category_bucket = bucket(by_category, category) + day_bucket = bucket(by_day, day) + for item in (category_bucket, day_bucket): + item["total_jobs"] += 1 + + if not finished: + continue + + finished_jobs += 1 + if status == "completed": + completed_jobs += 1 + elif status == "failed": + failed_jobs += 1 + elif status == "cancelled": + cancelled_jobs += 1 + + steps_raw = job.get("steps") + if steps_raw is not None: + steps = int(steps_raw) + steps_sum += steps + steps_count += 1 + for item in (category_bucket, day_bucket): + item["steps_sum"] += steps + item["steps_count"] += 1 + + estimated_cost_raw = (job.get("usage") or {}).get("estimated_cost_usd") + if estimated_cost_raw is not None: + estimated_cost = float(estimated_cost_raw) + cost_sum += estimated_cost + cost_count += 1 + for item in (category_bucket, day_bucket): + item["cost_sum"] += estimated_cost + item["cost_count"] += 1 + + for item in (category_bucket, day_bucket): + item["finished_jobs"] += 1 + if status == "completed": + item["completed_jobs"] += 1 + elif status == "failed": + item["failed_jobs"] += 1 + elif status == "cancelled": + item["cancelled_jobs"] += 1 + + def finalize(item: dict[str, Any]) -> dict[str, Any]: + finished = item["finished_jobs"] + return { + "label": item["label"], + "total_jobs": item["total_jobs"], + "finished_jobs": finished, + "completed_jobs": item["completed_jobs"], + "failed_jobs": item["failed_jobs"], + "cancelled_jobs": item["cancelled_jobs"], + "success_rate": round((item["completed_jobs"] / finished) * 100, 2) if finished else 0.0, + "avg_steps": round(item["steps_sum"] / item["steps_count"], 2) if item["steps_count"] else None, + "avg_cost_usd": round(item["cost_sum"] / item["cost_count"], 6) if item["cost_count"] else None, + } + + return { + "total_jobs": total_jobs, + "finished_jobs": finished_jobs, + "completed_jobs": completed_jobs, + "failed_jobs": failed_jobs, + "cancelled_jobs": cancelled_jobs, + "success_rate": round((completed_jobs / finished_jobs) * 100, 2) if finished_jobs else 0.0, + "avg_steps": round(steps_sum / steps_count, 2) if steps_count else None, + "avg_cost_usd": round(cost_sum / cost_count, 6) if cost_count else None, + "by_category": sorted((finalize(item) for item in by_category.values()), key=lambda item: (-item["success_rate"], item["label"])), + "timeline": sorted((finalize(item) for item in by_day.values()), key=lambda item: item["label"]), + } + def _build_app(tmp_path: Path, monkeypatch: Any, disable_ui: bool = False): monkeypatch.setattr(server_module, "JobManager", FakeJobManager) @@ -276,12 +407,67 @@ def test_replay_endpoint_skips_visual_paths_outside_artifacts(tmp_path: Path, mo assert payload["total_frames"] == 1 +def test_analytics_endpoint_groups_by_category_and_time(tmp_path: Path, monkeypatch: Any) -> None: + app, _ = _build_app(tmp_path, monkeypatch, disable_ui=False) + manager = app.state.manager + client = TestClient(app) + headers = {"Authorization": "Bearer test_token"} + + browser_completed = client.post("/api/jobs", headers=headers, json={"job": "Open amazon.de and checkout"}).json()["job_id"] + browser_failed = client.post("/api/jobs", headers=headers, json={"job": "Open website and login"}).json()["job_id"] + terminal_completed = client.post("/api/jobs", headers=headers, json={"job": "Run a shell command to inspect files"}).json()["job_id"] + + manager._jobs[browser_completed].update( + status="completed", + ended_at="2026-05-27T00:10:00Z", + steps=4, + created_at="2026-05-27T00:00:01Z", + usage={**manager._jobs[browser_completed]["usage"], "estimated_cost_usd": 0.12}, + ) + manager._jobs[browser_failed].update( + status="failed", + ended_at="2026-05-28T00:10:00Z", + steps=6, + created_at="2026-05-28T00:00:01Z", + usage={**manager._jobs[browser_failed]["usage"], "estimated_cost_usd": 0.24}, + ) + manager._jobs[terminal_completed].update( + status="completed", + ended_at="2026-05-28T00:15:00Z", + steps=10, + created_at="2026-05-28T00:00:02Z", + usage={**manager._jobs[terminal_completed]["usage"], "estimated_cost_usd": 0.05}, + ) + + analytics = client.get("/api/analytics", headers=headers) + assert analytics.status_code == 200 + payload = analytics.json() + + assert payload["total_jobs"] == 3 + assert payload["finished_jobs"] == 3 + assert payload["completed_jobs"] == 2 + assert payload["failed_jobs"] == 1 + assert payload["success_rate"] == 66.67 + assert payload["avg_steps"] == 6.67 + assert payload["avg_cost_usd"] == 0.136667 + + browser = next(row for row in payload["by_category"] if row["label"] == "Browser / web") + terminal = next(row for row in payload["by_category"] if row["label"] == "Files / terminal") + assert browser["finished_jobs"] == 2 + assert browser["success_rate"] == 50.0 + assert browser["avg_steps"] == 5.0 + assert terminal["success_rate"] == 100.0 + + assert [row["label"] for row in payload["timeline"]] == ["2026-05-27", "2026-05-28"] + + def test_ui_toggle(tmp_path: Path, monkeypatch: Any) -> None: app_enabled, _ = _build_app(tmp_path / "enabled", monkeypatch, disable_ui=False) client_enabled = TestClient(app_enabled) root_enabled = client_enabled.get("/") assert root_enabled.status_code == 200 assert "ScreenJob Monitor" in root_enabled.text + assert "Success by Objective Category" in root_enabled.text js_enabled = client_enabled.get("/ui/monitoring.js") assert js_enabled.status_code == 200 assert "const tokenInput" in js_enabled.text diff --git a/tests/test_storage.py b/tests/test_storage.py index 155ac7f..02da92c 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -72,3 +72,55 @@ def test_storage_response_fallback_uses_result_when_json_missing(tmp_path: Path) assert job is not None assert job["response"]["return"] == "Legacy result string" assert job["response"]["data"] is None + + +def test_history_db_analytics_groups_by_category_and_day(tmp_path: Path) -> None: + db = HistoryDB(tmp_path / "screenjob_test_analytics.db") + + db.create_job( + job_id="job_browser_ok", + objective="Open amazon.de and checkout", + model="gpt-5.4-mini", + created_at="2026-05-27T00:00:01Z", + safety_override=False, + disabled_tools=[], + ) + db.update_job("job_browser_ok", status="completed", steps=4, estimated_cost_usd=0.12) + + db.create_job( + job_id="job_browser_fail", + objective="Open website and login", + model="gpt-5.4-mini", + created_at="2026-05-28T00:00:01Z", + safety_override=False, + disabled_tools=[], + ) + db.update_job("job_browser_fail", status="failed", steps=6, estimated_cost_usd=0.24) + + db.create_job( + job_id="job_terminal_ok", + objective="Run a shell command to inspect files", + model="gpt-5.4-mini", + created_at="2026-05-28T00:00:02Z", + safety_override=False, + disabled_tools=[], + ) + db.update_job("job_terminal_ok", status="completed", steps=10, estimated_cost_usd=0.05) + + analytics = db.analytics() + assert analytics["total_jobs"] == 3 + assert analytics["finished_jobs"] == 3 + assert analytics["completed_jobs"] == 2 + assert analytics["failed_jobs"] == 1 + assert analytics["success_rate"] == 66.67 + assert analytics["avg_steps"] == 6.67 + assert analytics["avg_cost_usd"] == 0.136667 + + browser = next(row for row in analytics["by_category"] if row["label"] == "Browser / web") + terminal = next(row for row in analytics["by_category"] if row["label"] == "Files / terminal") + assert browser["finished_jobs"] == 2 + assert browser["success_rate"] == 50.0 + assert browser["avg_steps"] == 5.0 + assert terminal["success_rate"] == 100.0 + + assert [row["label"] for row in analytics["timeline"]] == ["2026-05-27", "2026-05-28"] diff --git a/todo.md b/todo.md index 7b75d22..0d54606 100644 --- a/todo.md +++ b/todo.md @@ -20,4 +20,4 @@ ## P3 - [x] Add Replay Mode; Ability to replay a session by reconstructing the screen from screenshots and overlaying tool calls and click and type events. -- [Idea] Add lightweight analytics dashboards (success rate by objective category, avg steps/cost over time). +- [x] Add lightweight analytics dashboards (success rate by objective category, avg steps/cost over time).