From 6f9eedcc7a5bf162a5402603e81f7d0eca2f7100 Mon Sep 17 00:00:00 2001 From: Space-Banane Date: Sun, 5 Apr 2026 19:48:00 +0200 Subject: [PATCH] reset --- .github/workflows/ci.yml | 23 ------ README.md | 69 +--------------- client/app.js | 159 ------------------------------------ client/index.html | 85 ------------------- client/styles.css | 108 ------------------------ pytest.ini | 3 - requirements-dev.txt | 2 - requirements.txt | 5 -- ruff.toml | 5 -- server/__init__.py | 1 - server/actions.py | 34 -------- server/config.py | 14 ---- server/grid.py | 136 ------------------------------ server/main.py | 133 ------------------------------ server/models.py | 67 --------------- server/planner.py | 70 ---------------- server/streamer.py | 38 --------- skill/__init__.py | 11 --- skill/agent_runner.py | 60 -------------- skill/clickthrough_skill.py | 98 ---------------------- tests/conftest.py | 29 ------- tests/test_agent_runner.py | 79 ------------------ tests/test_endpoints.py | 32 -------- tests/test_grid.py | 51 ------------ tests/test_planner.py | 32 -------- tests/test_streamer.py | 41 ---------- tests/test_ui.py | 12 --- 27 files changed, 1 insertion(+), 1396 deletions(-) delete mode 100644 .github/workflows/ci.yml delete mode 100644 client/app.js delete mode 100644 client/index.html delete mode 100644 client/styles.css delete mode 100644 pytest.ini delete mode 100644 requirements-dev.txt delete mode 100644 requirements.txt delete mode 100644 ruff.toml delete mode 100644 server/__init__.py delete mode 100644 server/actions.py delete mode 100644 server/config.py delete mode 100644 server/grid.py delete mode 100644 server/main.py delete mode 100644 server/models.py delete mode 100644 server/planner.py delete mode 100644 server/streamer.py delete mode 100644 skill/__init__.py delete mode 100644 skill/agent_runner.py delete mode 100644 skill/clickthrough_skill.py delete mode 100644 tests/conftest.py delete mode 100644 tests/test_agent_runner.py delete mode 100644 tests/test_endpoints.py delete mode 100644 tests/test_grid.py delete mode 100644 tests/test_planner.py delete mode 100644 tests/test_streamer.py delete mode 100644 tests/test_ui.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 35f817c..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: CI - -on: - push: {} - pull_request: {} - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - name: Install runtime dependencies - run: python -m pip install --upgrade pip && pip install -r requirements.txt - - name: Install dev dependencies - run: pip install -r requirements-dev.txt - - name: Run lints - run: ruff check server skill tests - - name: Run tests - run: pytest diff --git a/README.md b/README.md index a77e716..5a2c84e 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,2 @@ # Clickthrough - -Let an Agent interact with your Computer. - -`Clickthrough` is a proof-of-concept bridge between a vision-aware agent and a headless controller. The project is split into two halves: - -1. A Python server that accepts a static grid overlay (think of a screenshot broken into cells) and exposes lightweight endpoints to ask questions, plan actions, or even run pointer/keyboard events. -2. A **skill** that bundles the HTTP calls/intent construction so we can hardwire the same flow inside OpenClaw later. - -## Server surface (FastAPI) - -- `POST /grid/init`: Accepts a base64 screenshot plus the requested rows/columns, returns a `grid_id`, cell bounds, and helpful metadata. The grid is stored in-memory so the agent can reference cells by ID in later actions. -- `POST /grid/action`: Takes a plan (`grid_id`, optional target cell, and an action like `click`/`drag`/`type`) and returns a structured `ActionResult` with computed coordinates for tooling to consume. -- `GET /grid/{grid_id}/summary`: Returns both a heuristic description (`GridPlanner`) and a rich descriptor so the skill can summarize what it sees. -- `GET /grid/{grid_id}/history`: Streams back the action history for that grid so an agent or operator can audit what was done. -- `POST /grid/{grid_id}/plan`: Lets `GridPlanner` select the target and return a preview action plan without committing to it, so we can inspect coordinates before triggering events. -- `POST /grid/{grid_id}/refresh` + `GET /stream/screenshots`: Refresh the cached screenshot/metadata and broadcast the updated scene over a websocket so clients can redraw overlays in near real time. -- `GET /health`: A minimal health check for deployments. - -Vision metadata is kept on a per-grid basis, including history, layout dimensions, and any appended memo. Each `VisionGrid` also exposes a short textual summary so the skill layer can turn sensory data into sentences directly. - -## Skill layer (OpenClaw integration) - -The `skill/` package wraps the server calls and exposes helpers: - -- `ClickthroughSkill.describe_grid()` builds a grid session and returns the descriptor. -- `ClickthroughSkill.plan_action()` drives the `/grid/action` endpoint. -- `ClickthroughSkill.plan_with_planner()` calls `/grid/{grid_id}/plan`, so you can preview the `GridPlanner` suggestion before executing it. -- `ClickthroughSkill.grid_summary()` and `.grid_history()` surface the new metadata endpoints. -- `ClickthroughSkill.refresh_grid()` pushes a new screenshot and memo, triggering websocket listeners. -- `ClickthroughAgentRunner` simulates a tiny agent loop that asks the planner for a preview, executes the resulting action, and then gathers the summary/history so you can iterate on reasoning loops in tests. - -Future work can swap the stub runner for a full OpenClaw skill that keeps reasoning inside the agent and uses these primitives to steer the mouse/keyboard. - -## Screenshot streaming - -Capture loops can now talk to FastAPI in two ways: - -1. POST `/grid/{grid_id}/refresh` with fresh base64 screenshots and an optional memo; the server updates the cached grid metadata and broadcasts the change. -2. Open a websocket to `GET /stream/screenshots` (optionally passing `grid_id` as a query param) to receive realtime deltas whenever a refresh happens. Clients can use the descriptor/payload to redraw overlays or trigger new planner runs without polling. - -## Testing - -1. `python3 -m pip install -r requirements.txt` -2. `python3 -m pip install -r requirements-dev.txt` -3. `python3 -m pytest` - -The `tests/` suite covers grid construction, the FastAPI surface, and the skill/runner helpers. - -## Continuous Integration - -`.github/workflows/ci.yml` runs on pushes and PRs: - -- Checks out the repo and sets up Python 3.11. -- Installs dependencies (`requirements.txt` + `requirements-dev.txt`). -- Runs `ruff check` over the Python packages. -- Executes `pytest` to keep coverage high. - -## Control UI - -- `/ui/` serves a small control panel where you can bootstrap a grid from a base64 screenshot, ask the planner for a preview, execute clicks, refresh the screenshot, and watch the summary/history. -- Most traffic is HTTP: `/grid/init`, `/grid/{id}/plan`, `/grid/{id}/action`, `/grid/{id}/refresh`, `/grid/{id}/summary`, and `/grid/{id}/history`. Only the `/stream/screenshots` websocket pushes updates after a refresh so the overlay redraws. -- The FastAPI root now redirects to `/ui/` when the client assets are present, making the UI a lightweight entry point for demos or manual command-and-control work. - -## Next steps - -- Add OCR or UI heuristics so grid cells have meaningful labels before the agent reasons about them. -- Persist grids and histories in a lightweight store so long-running sessions survive restarts. -- Expand the UI to preview actions visually (perhaps overlaying cells on top of rendered screenshots). +Let an Agent interact with your Computer. \ No newline at end of file diff --git a/client/app.js b/client/app.js deleted file mode 100644 index 8fd6764..0000000 --- a/client/app.js +++ /dev/null @@ -1,159 +0,0 @@ -const gridForm = document.getElementById("grid-form"); -const descriptorEl = document.getElementById("descriptor"); -const gridMetaEl = document.getElementById("grid-meta"); -const summaryEl = document.getElementById("summary"); -const historyEl = document.getElementById("history"); -const planOutput = document.getElementById("plan-output"); -const preferredInput = document.getElementById("preferred-label"); -const refreshScreenshot = document.getElementById("refresh-screenshot"); -const refreshMemo = document.getElementById("refresh-memo"); -const logEl = document.getElementById("ws-log"); - -let currentGrid = null; -let lastPlan = null; -let ws = null; -let keepAliveId = null; - -const log = (message) => { - const timestamp = new Date().toLocaleTimeString(); - logEl.textContent = `[${timestamp}] ${message}\n${logEl.textContent}`; -}; - -const headers = { - "Content-Type": "application/json", -}; - -const subscribeToGrid = (gridId) => { - if (!gridId) return; - if (ws) { - ws.close(); - } - const protocol = window.location.protocol === "https:" ? "wss" : "ws"; - ws = new WebSocket(`${protocol}://${window.location.host}/stream/screenshots?grid_id=${gridId}`); - - ws.addEventListener("open", () => { - log(`WebSocket listening for grid ${gridId}`); - ws.send("ready"); - keepAliveId = setInterval(() => ws.send("ping"), 15000); - }); - - ws.addEventListener("message", (event) => { - log(`Update received → ${event.data}`); - }); - - ws.addEventListener("close", () => { - log("WebSocket disconnected"); - if (keepAliveId) { - clearInterval(keepAliveId); - keepAliveId = null; - } - }); -}; - -const updateDescriptor = (descriptor) => { - descriptorEl.textContent = JSON.stringify(descriptor, null, 2); - gridMetaEl.textContent = `Grid ${descriptor.grid_id} (${descriptor.rows}x${descriptor.columns}) · ${descriptor.cells.length} cells`; -}; - -const updateSummary = async () => { - if (!currentGrid) return; - const [summaryResponse, historyResponse] = await Promise.all([ - fetch(`/grid/${currentGrid}/summary`), - fetch(`/grid/${currentGrid}/history`), - ]); - - if (summaryResponse.ok) { - const payload = await summaryResponse.json(); - summaryEl.textContent = payload.summary; - } - - if (historyResponse.ok) { - const payload = await historyResponse.json(); - historyEl.textContent = JSON.stringify(payload.history, null, 2); - } -}; - -const initGrid = async (event) => { - event.preventDefault(); - const formData = new FormData(gridForm); - const payload = { - width: Number(formData.get("width")), - height: Number(formData.get("height")), - rows: Number(formData.get("rows")), - columns: Number(formData.get("columns")), - screenshot_base64: formData.get("screenshot"), - }; - const response = await fetch("/grid/init", { - method: "POST", - headers, - body: JSON.stringify(payload), - }); - const descriptor = await response.json(); - currentGrid = descriptor.grid_id; - updateDescriptor(descriptor); - await updateSummary(); - subscribeToGrid(currentGrid); - planOutput.textContent = "Plan preview will appear here."; - log(`Grid ${currentGrid} initialized.`); -}; - -document.getElementById("plan-button").addEventListener("click", async () => { - if (!currentGrid) { - log("Initialize a grid first."); - return; - } - const response = await fetch(`/grid/${currentGrid}/plan`, { - method: "POST", - headers, - body: JSON.stringify({ - preferred_label: preferredInput.value || null, - action: "click", - text: "ui-trigger", - }), - }); - const result = await response.json(); - lastPlan = result.plan; - planOutput.textContent = JSON.stringify(result, null, 2); -}); - -document.getElementById("run-action").addEventListener("click", async () => { - if (!lastPlan) { - log("Run the planner first."); - return; - } - const payload = { - grid_id: lastPlan.grid_id, - action: lastPlan.action, - target_cell: lastPlan.target_cell, - text: "from-ui", - comment: "UI action", - }; - const response = await fetch("/grid/action", { - method: "POST", - headers, - body: JSON.stringify(payload), - }); - const result = await response.json(); - log(`Action ${result.detail} at ${result.coordinates}`); - await updateSummary(); -}); - -document.getElementById("refresh-button").addEventListener("click", async () => { - if (!currentGrid) { - log("Start a grid first."); - return; - } - const payload = { - screenshot_base64: refreshScreenshot.value || "", - memo: refreshMemo.value || undefined, - }; - const response = await fetch(`/grid/${currentGrid}/refresh`, { - method: "POST", - headers, - body: JSON.stringify(payload), - }); - const data = await response.json(); - log(`Refresh acknowledged: ${JSON.stringify(data)}`); -}); - -gridForm.addEventListener("submit", initGrid); diff --git a/client/index.html b/client/index.html deleted file mode 100644 index 86206b9..0000000 --- a/client/index.html +++ /dev/null @@ -1,85 +0,0 @@ - - - - - Clickthrough Control - - - -
-
-

Clickthrough Control Panel

-

Most actions use HTTP; screenshots stream over WebSocket when refreshed.

-
- -
-

Grid bootstrap

-
- - - - - - -
-
- -
-

Grid status

-
No grid yet.
-

-      
- -
-

Planner & Actions

- -
- - -
-
Plan preview will appear here.
-
- -
-

Refresh Screenshot

- - - -

Refresh triggers /stream/screenshots so the UI can redraw.

-
- -
-

Summary & history

-
No data yet.
-
History will show here.
-
- -
-

Websocket log

-
Waiting for updates…
-
-
- - - diff --git a/client/styles.css b/client/styles.css deleted file mode 100644 index ec43300..0000000 --- a/client/styles.css +++ /dev/null @@ -1,108 +0,0 @@ -* { - box-sizing: border-box; -} - -body { - font-family: "Inter", "Segoe UI", system-ui, sans-serif; - margin: 0; - background: #121212; - color: #f5f5f5; -} - -main { - max-width: 960px; - margin: 0 auto; - padding: 24px; -} - -header { - text-align: center; - margin-bottom: 24px; -} - -header h1 { - margin-bottom: 8px; -} - -.card { - background: #1f1f1f; - padding: 16px; - border-radius: 16px; - margin-bottom: 16px; - box-shadow: 0 20px 45px rgba(0, 0, 0, 0.35); -} - -label { - display: block; - margin-bottom: 12px; -} - -label input, -label textarea { - width: 100%; - border-radius: 10px; - border: 1px solid #333; - background: #0f0f0f; - color: #f1f1f1; - padding: 8px 12px; - margin-top: 4px; - font-family: inherit; -} - -textarea { - font-family: inherit; -} - -button { - background: linear-gradient(135deg, #6d7cff, #3b82f6); - border: none; - padding: 10px 20px; - color: white; - border-radius: 999px; - font-weight: 600; - cursor: pointer; - transition: transform 0.15s ease; -} - -button:hover { - transform: translateY(-1px); -} - -.button-row { - display: flex; - gap: 12px; - flex-wrap: wrap; - margin-bottom: 12px; -} - -.monospace { - background: #0c0c0c; - border-radius: 12px; - padding: 12px; - border: 1px solid #333; - min-height: 80px; -} - -.note { - font-size: 0.9rem; - margin-top: 8px; - color: #b0b0b0; -} - -@media (min-width: 768px) { - label { - display: flex; - gap: 12px; - align-items: center; - } - - label input, - label textarea { - width: auto; - flex: 1; - } - - .stretch textarea { - width: 100%; - } -} diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 27eec68..0000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -testpaths = tests -python_files = test_*.py diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 6f7d025..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,2 +0,0 @@ -pytest>=8.0.0 -ruff>=0.0.1 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 02c6e7e..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -fastapi>=0.105.2 -uvicorn[standard]>=0.23.2 -pydantic>=2.8.2 -pydantic-settings>=2.5.0 -httpx>=0.28.1 diff --git a/ruff.toml b/ruff.toml deleted file mode 100644 index 4f9fac3..0000000 --- a/ruff.toml +++ /dev/null @@ -1,5 +0,0 @@ -[tool.ruff] -line-length = 100 -select = ["E", "F", "I", "S"] -target-version = "py311" -exclude = ["data", "__pycache__"] diff --git a/server/__init__.py b/server/__init__.py deleted file mode 100644 index ee978aa..0000000 --- a/server/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .main import app # noqa: F401 diff --git a/server/actions.py b/server/actions.py deleted file mode 100644 index 8c8f72b..0000000 --- a/server/actions.py +++ /dev/null @@ -1,34 +0,0 @@ -from __future__ import annotations - -from typing import Tuple - -from .models import ActionPayload, ActionResult - - -class ActionEngine: - def __init__(self, grid) -> None: - self.grid = grid - - def plan(self, payload: ActionPayload) -> ActionResult: - coords = self._resolve_coords(payload.target_cell) - detail = self._describe(payload, coords) - return ActionResult( - success=True, - detail=detail, - coordinates=coords, - payload={"comment": payload.comment or "", "text": payload.text or ""}, - ) - - def _resolve_coords(self, target_cell: str | None) -> Tuple[int, int] | None: - if not target_cell: - return None - return self.grid.resolve_cell_center(target_cell) - - def _describe( - self, payload: ActionPayload, coords: Tuple[int, int] | None - ) -> str: - cell_info = payload.target_cell or "free space" - location = f"@{cell_info}" if coords else "(no target)" - action_hint = payload.action.value - extra = f" text='{payload.text}'" if payload.text else "" - return f"Plan {action_hint} {location}{extra}" diff --git a/server/config.py b/server/config.py deleted file mode 100644 index bf5e282..0000000 --- a/server/config.py +++ /dev/null @@ -1,14 +0,0 @@ -from pathlib import Path - -from pydantic import ConfigDict -from pydantic_settings import BaseSettings - - -class ServerSettings(BaseSettings): - grid_rows: int = 4 - grid_cols: int = 4 - cell_margin_px: int = 4 - storage_dir: Path = Path("data/screenshots") - default_timeout: int = 10 - - model_config = ConfigDict(env_prefix="CLICKTHROUGH_", env_file=".env") diff --git a/server/grid.py b/server/grid.py deleted file mode 100644 index 720ef8c..0000000 --- a/server/grid.py +++ /dev/null @@ -1,136 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any, Dict, List, Tuple -import uuid - -from .actions import ActionEngine -from .config import ServerSettings -from .models import ( - ActionPayload, - ActionResult, - GridCellModel, - GridDescriptor, - GridInitRequest, -) - - -@dataclass -class _StoredCell: - model: GridCellModel - center: Tuple[int, int] - - -class VisionGrid: - def __init__(self, request: GridInitRequest, grid_id: str, rows: int, columns: int): - self.grid_id = grid_id - self.screenshot = request.screenshot_base64 - self.memo = request.memo - self.rows = rows - self.columns = columns - self.width = request.width - self.height = request.height - self.cells: Dict[str, _StoredCell] = {} - self._action_history: List[dict[str, Any]] = [] - self._engine = ActionEngine(self) - self._build_cells() - - def _build_cells(self, margin: int = 4) -> None: - cell_width = max(1, self.width // self.columns) - cell_height = max(1, self.height // self.rows) - - for row in range(self.rows): - for col in range(self.columns): - left = col * cell_width + margin - top = row * cell_height + margin - right = min(self.width - margin, (col + 1) * cell_width - margin) - bottom = min(self.height - margin, (row + 1) * cell_height - margin) - cell_id = f"{self.grid_id}-{row}-{col}" - bounds = (left, top, right, bottom) - center = ((left + right) // 2, (top + bottom) // 2) - cell = GridCellModel( - cell_id=cell_id, - row=row, - column=col, - bounds=bounds, - ) - self.cells[cell_id] = _StoredCell(model=cell, center=center) - - def describe(self) -> GridDescriptor: - return GridDescriptor( - grid_id=self.grid_id, - rows=self.rows, - columns=self.columns, - cells=[cell.model for cell in self.cells.values()], - metadata=self.metadata, - ) - - @property - def metadata(self) -> Dict[str, Any]: - return { - "memo": self.memo or "", - "width": self.width, - "height": self.height, - } - - def resolve_cell_center(self, cell_id: str) -> Tuple[int, int]: - cell = self.cells.get(cell_id) - if not cell: - raise KeyError(f"Unknown cell {cell_id}") - return cell.center - - def preview_action(self, payload: ActionPayload) -> ActionResult: - return self._engine.plan(payload) - - def apply_action(self, payload: ActionPayload) -> ActionResult: - result = self._engine.plan(payload) - self._action_history.append(result.model_dump()) - return result - - def update_screenshot(self, screenshot_base64: str, memo: str | None = None) -> None: - self.screenshot = screenshot_base64 - if memo: - self.memo = memo - - @property - def action_history(self) -> List[dict[str, Any]]: - return list(self._action_history) - - def summary(self) -> str: - last_action = self._action_history[-1] if self._action_history else None - last_summary = ( - f"Last action: {last_action.get('detail')}" if last_action else "No actions recorded yet" - ) - return ( - f"Grid {self.grid_id} ({self.rows}x{self.columns}) with {len(self.cells)} cells. {last_summary}." - ) - - -class GridManager: - def __init__(self, settings: ServerSettings): - self.settings = settings - self._grids: Dict[str, VisionGrid] = {} - - @property - def grid_count(self) -> int: - return len(self._grids) - - def create_grid(self, request: GridInitRequest) -> VisionGrid: - rows = request.rows or self.settings.grid_rows - columns = request.columns or self.settings.grid_cols - grid_id = uuid.uuid4().hex - grid = VisionGrid(request, grid_id, rows, columns) - self._grids[grid_id] = grid - return grid - - def get_grid(self, grid_id: str) -> VisionGrid: - try: - return self._grids[grid_id] - except KeyError as exc: - raise KeyError(f"Grid {grid_id} not found") from exc - - def get_history(self, grid_id: str) -> List[dict[str, Any]]: - return self.get_grid(grid_id).action_history - - def clear(self) -> None: - self._grids.clear() diff --git a/server/main.py b/server/main.py deleted file mode 100644 index 38baf55..0000000 --- a/server/main.py +++ /dev/null @@ -1,133 +0,0 @@ -import time -from pathlib import Path - -from fastapi import FastAPI, HTTPException, WebSocket, WebSocketDisconnect -from fastapi.responses import RedirectResponse -from fastapi.staticfiles import StaticFiles - -from .config import ServerSettings -from .grid import GridManager -from .models import ( - ActionPayload, - GridDescriptor, - GridInitRequest, - GridPlanRequest, - GridRefreshRequest, -) -from .planner import GridPlanner -from .streamer import ScreenshotStreamer - - -settings = ServerSettings() -manager = GridManager(settings) -planner = GridPlanner() -streamer = ScreenshotStreamer() - -app = FastAPI( - title="Clickthrough", - description="Grid-aware surface that lets an agent plan clicks, drags, and typing on a fake screenshot", - version="0.3.0", -) - -client_dir = Path(__file__).resolve().parent.parent / "client" -if client_dir.exists(): - app.mount("/ui", StaticFiles(directory=str(client_dir), html=True), name="ui") - - -@app.get("/") -async def root(): - if client_dir.exists(): - return RedirectResponse("/ui/") - return {"status": "ok", "grid_count": manager.grid_count} - - -@app.get("/health") -def health_check() -> dict[str, str]: - return {"status": "ok", "grid_count": str(manager.grid_count)} - - -@app.post("/grid/init", response_model=GridDescriptor) -def init_grid(request: GridInitRequest) -> GridDescriptor: - grid = manager.create_grid(request) - return grid.describe() - - -@app.post("/grid/action") -def apply_action(payload: ActionPayload): - try: - grid = manager.get_grid(payload.grid_id) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) from exc - return grid.apply_action(payload) - - -@app.get("/grid/{grid_id}/summary") -def grid_summary(grid_id: str): - try: - grid = manager.get_grid(grid_id) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) from exc - descriptor = grid.describe() - return { - "grid_id": grid_id, - "summary": planner.describe(descriptor), - "details": grid.summary(), - "descriptor": descriptor, - } - - -@app.get("/grid/{grid_id}/history") -def grid_history(grid_id: str): - try: - history = manager.get_history(grid_id) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) from exc - return {"grid_id": grid_id, "history": history} - - -@app.post("/grid/{grid_id}/plan") -def plan_grid(grid_id: str, request: GridPlanRequest): - try: - grid = manager.get_grid(grid_id) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) from exc - descriptor = grid.describe() - payload = planner.build_payload( - descriptor, - action=request.action, - preferred_label=request.preferred_label, - text=request.text, - comment=request.comment, - ) - result = grid.preview_action(payload) - return {"plan": payload.model_dump(), "result": result, "descriptor": descriptor} - - -@app.post("/grid/{grid_id}/refresh") -async def refresh_grid(grid_id: str, payload: GridRefreshRequest): - try: - grid = manager.get_grid(grid_id) - except KeyError as exc: - raise HTTPException(status_code=404, detail=str(exc)) from exc - grid.update_screenshot(payload.screenshot_base64, payload.memo) - descriptor = grid.describe() - await streamer.broadcast( - grid_id, - { - "grid_id": grid_id, - "timestamp": time.time(), - "descriptor": descriptor, - "screenshot_base64": payload.screenshot_base64, - }, - ) - return {"status": "updated", "grid_id": grid_id} - - -@app.websocket("/stream/screenshots") -async def stream_screenshots(websocket: WebSocket, grid_id: str | None = None): - key = await streamer.connect(websocket, grid_id) - try: - while True: - await websocket.receive_text() - except WebSocketDisconnect: - streamer.disconnect(websocket, key) diff --git a/server/models.py b/server/models.py deleted file mode 100644 index 141680e..0000000 --- a/server/models.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import annotations - -from enum import Enum -from typing import Any, Dict, List, Optional, Tuple - -from pydantic import BaseModel, Field - - -class ActionType(str, Enum): - CLICK = "click" - DOUBLE_CLICK = "double_click" - DRAG = "drag" - TYPE = "type" - SCROLL = "scroll" - - -class GridInitRequest(BaseModel): - width: int - height: int - screenshot_base64: str - rows: Optional[int] = None - columns: Optional[int] = None - memo: Optional[str] = None - - -class GridCellModel(BaseModel): - cell_id: str - row: int - column: int - bounds: Tuple[int, int, int, int] - label: Optional[str] = None - - -class GridDescriptor(BaseModel): - grid_id: str - rows: int - columns: int - cells: List[GridCellModel] - metadata: Dict[str, Any] = Field(default_factory=dict) - - -class ActionPayload(BaseModel): - grid_id: str - action: ActionType - target_cell: Optional[str] = None - text: Optional[str] = None - comment: Optional[str] = None - data: Dict[str, Any] = Field(default_factory=dict) - - -class ActionResult(BaseModel): - success: bool - detail: str - coordinates: Optional[Tuple[int, int]] = None - payload: Dict[str, Any] = Field(default_factory=dict) - - -class GridPlanRequest(BaseModel): - preferred_label: Optional[str] = None - action: ActionType = ActionType.CLICK - text: Optional[str] = None - comment: Optional[str] = None - - -class GridRefreshRequest(BaseModel): - screenshot_base64: str - memo: Optional[str] = None diff --git a/server/planner.py b/server/planner.py deleted file mode 100644 index e8f73bb..0000000 --- a/server/planner.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import annotations - -from math import hypot -from typing import Sequence - -from .models import ActionPayload, ActionType, GridCellModel, GridDescriptor - - -class GridPlanner: - """Helper that picks a grid cell using simple heuristics.""" - - def select_cell( - self, descriptor: GridDescriptor, preferred_label: str | None = None - ) -> GridCellModel | None: - if not descriptor.cells: - return None - - if preferred_label: - match = self._match_label(descriptor.cells, preferred_label) - if match: - return match - - center_point = self._grid_center(descriptor) - return min(descriptor.cells, key=lambda cell: self._distance(self._cell_center(cell), center_point)) - - def build_payload( - self, - descriptor: GridDescriptor, - action: ActionType = ActionType.CLICK, - preferred_label: str | None = None, - text: str | None = None, - comment: str | None = None, - ) -> ActionPayload: - target = self.select_cell(descriptor, preferred_label) - return ActionPayload( - grid_id=descriptor.grid_id, - action=action, - target_cell=target.cell_id if target else None, - text=text, - comment=comment, - ) - - def describe(self, descriptor: GridDescriptor) -> str: - cell_count = len(descriptor.cells) - return ( - f"Grid {descriptor.grid_id} is {descriptor.rows}x{descriptor.columns} with {cell_count} cells." - ) - - def _grid_center(self, descriptor: GridDescriptor) -> tuple[float, float]: - width = descriptor.metadata.get("width", 0) - height = descriptor.metadata.get("height", 0) - return (width / 2, height / 2) - - def _cell_center(self, cell: GridCellModel) -> tuple[float, float]: - left, top, right, bottom = cell.bounds - return ((left + right) / 2, (top + bottom) / 2) - - def _distance( - self, first: tuple[float, float], second: tuple[float, float] - ) -> float: - return hypot(first[0] - second[0], first[1] - second[1]) - - def _match_label( - self, cells: Sequence[GridCellModel], label: str - ) -> GridCellModel | None: - lowered = label.lower() - for cell in cells: - if cell.label and lowered in cell.label.lower(): - return cell - return None diff --git a/server/streamer.py b/server/streamer.py deleted file mode 100644 index 5990def..0000000 --- a/server/streamer.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import annotations - -from collections import defaultdict -from typing import Any, DefaultDict, Dict, List - -from fastapi import WebSocket -from websockets.exceptions import ConnectionClosedError - - -class ScreenshotStreamer: - """Keeps websocket listeners and pushes screenshot updates.""" - - def __init__(self) -> None: - self._listeners: DefaultDict[str, List[WebSocket]] = defaultdict(list) - - async def connect(self, websocket: WebSocket, grid_id: str | None = None) -> str: - await websocket.accept() - key = grid_id or "*" - self._listeners[key].append(websocket) - return key - - def disconnect(self, websocket: WebSocket, grid_key: str | None = None) -> None: - key = grid_key or "*" - sockets = self._listeners.get(key) - if not sockets: - return - if websocket in sockets: - sockets.remove(websocket) - if not sockets: - self._listeners.pop(key, None) - - async def broadcast(self, grid_id: str, payload: Dict[str, Any]) -> None: - listeners = list(self._listeners.get(grid_id, [])) + list(self._listeners.get("*", [])) - for websocket in listeners: - try: - await websocket.send_json(payload) - except (ConnectionClosedError, RuntimeError): - self.disconnect(websocket, grid_id) diff --git a/skill/__init__.py b/skill/__init__.py deleted file mode 100644 index dc65602..0000000 --- a/skill/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Utility helpers for the Clickthrough agent skill.""" - -from .agent_runner import AgentRunResult, ClickthroughAgentRunner -from .clickthrough_skill import ActionPlan, ClickthroughSkill - -__all__ = [ - "ClickthroughSkill", - "ActionPlan", - "ClickthroughAgentRunner", - "AgentRunResult", -] diff --git a/skill/agent_runner.py b/skill/agent_runner.py deleted file mode 100644 index bcd8e6f..0000000 --- a/skill/agent_runner.py +++ /dev/null @@ -1,60 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict - -from .clickthrough_skill import ActionPlan, ClickthroughSkill - - -@dataclass -class AgentRunResult: - summary: Dict[str, Any] - action: Dict[str, Any] - history: Dict[str, Any] - grid: Dict[str, Any] - plan_preview: Dict[str, Any] - - -class ClickthroughAgentRunner: - def __init__(self, skill: ClickthroughSkill) -> None: - self.skill = skill - - def run_once( - self, - screenshot_base64: str, - width: int, - height: int, - rows: int = 4, - columns: int = 4, - preferred_label: str | None = None, - action: str = "click", - text: str | None = None, - ) -> AgentRunResult: - grid = self.skill.describe_grid( - screenshot_base64=screenshot_base64, - width=width, - height=height, - rows=rows, - columns=columns, - ) - plan_response = self.skill.plan_with_planner( - grid_id=grid["grid_id"], - preferred_label=preferred_label, - action=action, - text=text, - ) - plan_payload = plan_response["plan"] - plan = ActionPlan( - grid_id=plan_payload["grid_id"], - target_cell=plan_payload.get("target_cell"), - action=plan_payload["action"], - text=plan_payload.get("text"), - ) - action_result = self.skill.plan_action(plan) - summary = self.skill.grid_summary(grid["grid_id"]) - history = self.skill.grid_history(grid["grid_id"]) - return AgentRunResult( - summary=summary, - action=action_result, - history=history, - grid=grid, - plan_preview=plan_response, - ) diff --git a/skill/clickthrough_skill.py b/skill/clickthrough_skill.py deleted file mode 100644 index 48891b7..0000000 --- a/skill/clickthrough_skill.py +++ /dev/null @@ -1,98 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict - -import httpx - - -@dataclass -class ActionPlan: - grid_id: str - target_cell: str | None - action: str - text: str | None = None - - -class ClickthroughSkill: - """Lightweight wrapper around the Clickthrough HTTP API.""" - - def __init__(self, server_url: str = "http://localhost:8000") -> None: - self._client = httpx.Client(base_url=server_url, timeout=10) - - def describe_grid( - self, - screenshot_base64: str, - width: int, - height: int, - rows: int = 4, - columns: int = 4, - ) -> Dict[str, Any]: - payload = { - "width": width, - "height": height, - "rows": rows, - "columns": columns, - "screenshot_base64": screenshot_base64, - "memo": "agent-powered grid", - } - response = self._client.post("/grid/init", json=payload) - response.raise_for_status() - return response.json() - - def plan_action(self, plan: ActionPlan) -> Dict[str, Any]: - payload = { - "grid_id": plan.grid_id, - "action": plan.action, - "target_cell": plan.target_cell, - "text": plan.text, - "comment": "skill-generated plan", - } - response = self._client.post("/grid/action", json=payload) - response.raise_for_status() - return response.json() - - def grid_summary(self, grid_id: str) -> Dict[str, Any]: - response = self._client.get(f"/grid/{grid_id}/summary") - response.raise_for_status() - return response.json() - - def grid_history(self, grid_id: str) -> Dict[str, Any]: - response = self._client.get(f"/grid/{grid_id}/history") - response.raise_for_status() - return response.json() - - def plan_with_planner( - self, - grid_id: str, - preferred_label: str | None = None, - action: str = "click", - text: str | None = None, - comment: str | None = None, - ) -> Dict[str, Any]: - payload = { - "preferred_label": preferred_label, - "action": action, - "text": text, - "comment": comment or "planner-generated", - } - response = self._client.post(f"/grid/{grid_id}/plan", json=payload) - response.raise_for_status() - return response.json() - - def refresh_grid(self, grid_id: str, screenshot_base64: str, memo: str | None = None) -> Dict[str, Any]: - payload = {"screenshot_base64": screenshot_base64, "memo": memo} - response = self._client.post(f"/grid/{grid_id}/refresh", json=payload) - response.raise_for_status() - return response.json() - - -if __name__ == "__main__": - import base64 - - dummy = base64.b64encode(b"fake-screenshot").decode() - skill = ClickthroughSkill() - grid = skill.describe_grid(dummy, width=800, height=600) - print("Grid cells:", len(grid.get("cells", []))) - if grid.get("cells"): - first_cell = grid["cells"][0]["cell_id"] - result = skill.plan_action(ActionPlan(grid_id=grid["grid_id"], target_cell=first_cell, action="click")) - print("Action result:", result) diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index f4579f5..0000000 --- a/tests/conftest.py +++ /dev/null @@ -1,29 +0,0 @@ -import base64 - -import pytest - -from server.main import manager - - -@pytest.fixture -def fake_screenshot() -> str: - """Return a reproducible base64 string representing a dummy screenshot.""" - return base64.b64encode(b"clickthrough-dummy").decode() - - -@pytest.fixture -def default_grid_request(fake_screenshot): - return { - "width": 640, - "height": 480, - "screenshot_base64": fake_screenshot, - "rows": 3, - "columns": 3, - } - - -@pytest.fixture(autouse=True) -def reset_manager_state(): - manager._grids.clear() - yield - manager._grids.clear() diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py deleted file mode 100644 index c1e3ae5..0000000 --- a/tests/test_agent_runner.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import Any, Dict - -from skill.agent_runner import ClickthroughAgentRunner -from skill.clickthrough_skill import ActionPlan, ClickthroughSkill - - -class DummySkill(ClickthroughSkill): - def __init__(self): - self.last_plan: ActionPlan | None = None - - def describe_grid( - self, - screenshot_base64: str, - width: int, - height: int, - rows: int = 4, - columns: int = 4, - ) -> Dict[str, Any]: - return { - "grid_id": "dummy-grid", - "cells": [ - {"cell_id": "dummy-grid-1", "label": "button", "bounds": [0, 0, 100, 100]}, - {"cell_id": "dummy-grid-2", "label": "target", "bounds": [100, 0, 200, 100]}, - ], - } - - def plan_with_planner( - self, - grid_id: str, - preferred_label: str | None = None, - action: str = "click", - text: str | None = None, - comment: str | None = None, - ) -> Dict[str, Any]: - cells = ["dummy-grid-1", "dummy-grid-2"] - if preferred_label == "target": - target = "dummy-grid-2" - else: - target = cells[len(cells) // 2] - plan = { - "grid_id": grid_id, - "target_cell": target, - "action": action, - "text": text, - "comment": comment, - } - return { - "plan": plan, - "result": {"success": True, "detail": "preview"}, - "descriptor": {"grid_id": grid_id}, - } - - def plan_action(self, plan: ActionPlan) -> Dict[str, Any]: - self.last_plan = plan - return {"success": True, "target_cell": plan.target_cell} - - def grid_summary(self, grid_id: str) -> Dict[str, Any]: - return {"grid_id": grid_id, "summary": "ok"} - - def grid_history(self, grid_id: str) -> Dict[str, Any]: - return {"grid_id": grid_id, "history": []} - - -def test_agent_runner_prefers_label(): - runner = ClickthroughAgentRunner(DummySkill()) - result = runner.run_once( - screenshot_base64="AA==", - width=120, - height=80, - preferred_label="target", - ) - assert result.action["target_cell"] == "dummy-grid-2" - assert result.summary["summary"] == "ok" - - -def test_agent_runner_defaults_to_center(): - runner = ClickthroughAgentRunner(DummySkill()) - result = runner.run_once(screenshot_base64="AA==", width=120, height=80) - assert result.action["target_cell"] == "dummy-grid-2" diff --git a/tests/test_endpoints.py b/tests/test_endpoints.py deleted file mode 100644 index 501a86d..0000000 --- a/tests/test_endpoints.py +++ /dev/null @@ -1,32 +0,0 @@ -from fastapi.testclient import TestClient - -from server.main import app, manager - -test_client = TestClient(app) - - -def test_plan_endpoint(default_grid_request): - init_response = test_client.post("/grid/init", json=default_grid_request) - grid_id = init_response.json()["grid_id"] - - plan_response = test_client.post( - f"/grid/{grid_id}/plan", - json={"preferred_label": None, "action": "click", "text": "hello"}, - ) - assert plan_response.status_code == 200 - payload = plan_response.json() - assert payload["plan"]["grid_id"] == grid_id - assert payload["result"]["success"] - - -def test_refresh_endpoint(default_grid_request): - init_response = test_client.post("/grid/init", json=default_grid_request) - grid_id = init_response.json()["grid_id"] - - refresh_response = test_client.post( - f"/grid/{grid_id}/refresh", json={"screenshot_base64": "AAA", "memo": "updated"} - ) - assert refresh_response.status_code == 200 - grid = manager.get_grid(grid_id) - assert grid.screenshot == "AAA" - assert grid.memo == "updated" diff --git a/tests/test_grid.py b/tests/test_grid.py deleted file mode 100644 index ddfe8be..0000000 --- a/tests/test_grid.py +++ /dev/null @@ -1,51 +0,0 @@ -from server.config import ServerSettings -from server.grid import GridManager -from server.models import ActionPayload, ActionType, GridInitRequest - - -def test_grid_creation_respects_dimensions(default_grid_request): - settings = ServerSettings(grid_rows=2, grid_cols=2) - manager = GridManager(settings) - request = GridInitRequest(**default_grid_request) - grid = manager.create_grid(request) - - descriptor = grid.describe() - assert descriptor.grid_id - assert descriptor.rows == 3 - assert descriptor.columns == 3 - assert len(descriptor.cells) == 9 - assert descriptor.metadata.get("width") == 640 - assert descriptor.metadata.get("height") == 480 - - -def test_grid_action_records_history(default_grid_request): - manager = GridManager(ServerSettings()) - request = GridInitRequest(**default_grid_request) - grid = manager.create_grid(request) - descriptor = grid.describe() - target_cell = descriptor.cells[0].cell_id - - payload = ActionPayload( - grid_id=descriptor.grid_id, - action=ActionType.CLICK, - target_cell=target_cell, - comment="click test", - ) - result = grid.apply_action(payload) - - assert result.success - assert result.coordinates is not None - assert grid.action_history[-1]["coordinates"] == result.coordinates - - -def test_manager_get_grid_missing(default_grid_request): - manager = GridManager(ServerSettings()) - request = GridInitRequest(**default_grid_request) - _ = manager.create_grid(request) - - try: - manager.get_grid("does-not-exist") - found = True - except KeyError: - found = False - assert not found diff --git a/tests/test_planner.py b/tests/test_planner.py deleted file mode 100644 index 72a25ea..0000000 --- a/tests/test_planner.py +++ /dev/null @@ -1,32 +0,0 @@ -from server.config import ServerSettings -from server.grid import GridManager -from server.planner import GridPlanner -from server.models import ActionType, GridInitRequest - - -def test_planner_preferred_label(default_grid_request): - settings = ServerSettings() - manager = GridManager(settings) - request = GridInitRequest(**default_grid_request) - grid = manager.create_grid(request) - descriptor = grid.describe() - descriptor.cells[0].label = "target" - - planner = GridPlanner() - payload = planner.build_payload(descriptor, preferred_label="target", action=ActionType.CLICK) - - assert payload.target_cell == descriptor.cells[0].cell_id - - -def test_planner_falls_back_to_center(default_grid_request): - settings = ServerSettings() - manager = GridManager(settings) - request = GridInitRequest(**default_grid_request) - grid = manager.create_grid(request) - descriptor = grid.describe() - - planner = GridPlanner() - payload = planner.build_payload(descriptor, action=ActionType.CLICK) - - assert payload.target_cell is not None - assert payload.grid_id == descriptor.grid_id diff --git a/tests/test_streamer.py b/tests/test_streamer.py deleted file mode 100644 index a65239c..0000000 --- a/tests/test_streamer.py +++ /dev/null @@ -1,41 +0,0 @@ -import asyncio - -from server.streamer import ScreenshotStreamer - - -class DummyWebSocket: - def __init__(self): - self.sent = [] - self.accepted = False - - async def accept(self) -> None: - self.accepted = True - - async def send_json(self, payload): - self.sent.append(payload) - - -def test_streamer_broadcasts_to_grid(): - streamer = ScreenshotStreamer() - socket = DummyWebSocket() - - async def scenario(): - key = await streamer.connect(socket, "grid-123") - await streamer.broadcast("grid-123", {"frame": 1}) - streamer.disconnect(socket, key) - - asyncio.run(scenario()) - assert socket.sent == [{"frame": 1}] - - -def test_streamer_wildcard_listener_receives_updates(): - streamer = ScreenshotStreamer() - socket = DummyWebSocket() - - async def scenario(): - key = await streamer.connect(socket, None) - await streamer.broadcast("grid-456", {"frame": 2}) - streamer.disconnect(socket, key) - - asyncio.run(scenario()) - assert socket.sent == [{"frame": 2}] diff --git a/tests/test_ui.py b/tests/test_ui.py deleted file mode 100644 index e3222ba..0000000 --- a/tests/test_ui.py +++ /dev/null @@ -1,12 +0,0 @@ -from fastapi.testclient import TestClient - -from server.main import app - - -test_client = TestClient(app) - - -def test_ui_root_serves_index(): - response = test_client.get("/ui/") - assert response.status_code == 200 - assert "Clickthrough Control" in response.text