From b1d2b6b321df29b0eddc6136c9f7da04f8458452 Mon Sep 17 00:00:00 2001 From: Luna Date: Sun, 5 Apr 2026 19:27:55 +0200 Subject: [PATCH] Add grid planner, CI, and tests --- .github/workflows/ci.yml | 23 ++++++++++++++ README.md | 39 +++++++++++++++++------ pytest.ini | 3 ++ requirements-dev.txt | 2 ++ requirements.txt | 3 +- ruff.toml | 5 +++ server/config.py | 7 ++--- server/grid.py | 26 ++++++++++++++-- server/main.py | 28 ++++++++++++++++- server/planner.py | 53 +++++++++++++++++++++++++++++++ skill/__init__.py | 8 ++++- skill/agent_runner.py | 62 +++++++++++++++++++++++++++++++++++++ skill/clickthrough_skill.py | 10 ++++++ tests/conftest.py | 29 +++++++++++++++++ tests/test_agent_runner.py | 53 +++++++++++++++++++++++++++++++ tests/test_grid.py | 51 ++++++++++++++++++++++++++++++ 16 files changed, 383 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 pytest.ini create mode 100644 requirements-dev.txt create mode 100644 ruff.toml create mode 100644 server/planner.py create mode 100644 skill/agent_runner.py create mode 100644 tests/conftest.py create mode 100644 tests/test_agent_runner.py create mode 100644 tests/test_grid.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..35f817c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,23 @@ +name: CI + +on: + push: {} + pull_request: {} + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11 + - name: Install runtime dependencies + run: python -m pip install --upgrade pip && pip install -r requirements.txt + - name: Install dev dependencies + run: pip install -r requirements-dev.txt + - name: Run lints + run: ruff check server skill tests + - name: Run tests + run: pytest diff --git a/README.md b/README.md index 233c457..63f8eda 100644 --- a/README.md +++ b/README.md @@ -11,23 +11,42 @@ Let an Agent interact 
with your Computer. - `POST /grid/init`: Accepts a base64 screenshot plus the requested rows/columns, returns a `grid_id`, cell bounds, and helpful metadata. The grid is stored in-memory so the agent can reference cells by ID in later actions. - `POST /grid/action`: Takes a plan (`grid_id`, optional target cell, and an action like `click`/`drag`/`type`) and returns a structured `ActionResult` with computed coordinates for tooling to consume. +- `GET /grid/{grid_id}/summary`: Returns both a heuristic description (`GridPlanner`) and a rich descriptor so the skill can summarize what it sees. +- `GET /grid/{grid_id}/history`: Returns the action history for that grid so an agent or operator can audit what was done. - `GET /health`: A minimal health check for deployments. -The server tracks each grid by a UUID and keeps layout metadata so multiple agents can keep in sync with the same screenshot/scene. +Vision metadata is kept on a per-grid basis, including history, layout dimensions, and any appended memo. Each `VisionGrid` also exposes a short textual summary so the skill layer can turn sensory data into sentences directly. ## Skill layer (OpenClaw integration) -The `skill/` package is a placeholder for how an agent action would look in OpenClaw. It wraps the server calls, interprets the grid cells, and exposes helpers such as `describe_grid()` and `plan_action()` so future work can plug into the agent toolkit directly. +The `skill/` package wraps the server calls and exposes helpers: -## Getting started +- `ClickthroughSkill.describe_grid()` builds a grid session and returns the descriptor. +- `ClickthroughSkill.plan_action()` drives the `/grid/action` endpoint. +- `ClickthroughSkill.grid_summary()` and `.grid_history()` surface the new metadata endpoints. +- `ClickthroughAgentRunner` simulates a tiny agent loop that chooses a cell (optionally by label), submits an action, and fetches the summary/history. -1. 
Install dependencies: `python -m pip install -r requirements.txt`. -2. Run the server: `uvicorn server.main:app --reload`. -3. Use the skill helper to bootstrap a grid, or wire the REST endpoints into a higher-level agent. +Future work can swap the stub runner for a full OpenClaw skill that keeps reasoning inside the agent and uses these primitives to steer the mouse/keyboard. + +## Testing + +1. `python3 -m pip install -r requirements.txt` +2. `python3 -m pip install -r requirements-dev.txt` +3. `python3 -m pytest` + +The `tests/` suite covers grid construction, action-history recording, and the skill/runner helpers. + +## Continuous Integration + +`.github/workflows/ci.yml` runs on pushes and PRs: + +- Checks out the repo and sets up Python 3.11. +- Installs dependencies (`requirements.txt` + `requirements-dev.txt`). +- Runs `ruff check` over the Python packages. +- Executes `pytest` to run the test suite. ## Next steps -- Add real OCR/layout logic so cells understand labels. -- Turn the action planner into a state machine that can focus/double-click/type/drag. -- Persist grid sessions for longer running interactions. -- Ship the OpenClaw skill (skill folder) as a plugin that can call `http://localhost:8000` and scaffold the agent’s reasoning. +- Add OCR or UI heuristics so grid cells have meaningful labels before the agent reasons about them. +- Persist grids and histories in a lightweight store so long-running sessions survive restarts. +- Expose a websocket/watch endpoint that streams updated screenshots and invalidates cached `grid_id`s when the scene changes. 
diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..27eec68 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +python_files = test_*.py diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..6f7d025 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +pytest>=8.0.0 +ruff>=0.0.1 diff --git a/requirements.txt b/requirements.txt index a99cc4e..02c6e7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ fastapi>=0.105.2 uvicorn[standard]>=0.23.2 pydantic>=2.8.2 -httpx>=0.30.0 +pydantic-settings>=2.5.0 +httpx>=0.28.1 diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..4f9fac3 --- /dev/null +++ b/ruff.toml @@ -0,0 +1,5 @@ +[tool.ruff] +line-length = 100 +select = ["E", "F", "I", "S"] +target-version = "py311" +exclude = ["data", "__pycache__"] diff --git a/server/config.py b/server/config.py index 35e2808..bf5e282 100644 --- a/server/config.py +++ b/server/config.py @@ -1,6 +1,7 @@ from pathlib import Path -from pydantic import BaseSettings +from pydantic import ConfigDict +from pydantic_settings import BaseSettings class ServerSettings(BaseSettings): @@ -10,6 +11,4 @@ class ServerSettings(BaseSettings): storage_dir: Path = Path("data/screenshots") default_timeout: int = 10 - class Config: - env_prefix = "CLICKTHROUGH_" - env_file = ".env" + model_config = ConfigDict(env_prefix="CLICKTHROUGH_", env_file=".env") diff --git a/server/grid.py b/server/grid.py index b1a1a53..e8e914b 100644 --- a/server/grid.py +++ b/server/grid.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Dict, Tuple +from typing import Dict, List, Tuple, Any import uuid from .actions import ActionEngine @@ -31,6 +31,7 @@ class VisionGrid: self.width = request.width self.height = request.height self.cells: Dict[str, _StoredCell] = {} + self._action_history: List[dict[str, Any]] = [] self._engine = ActionEngine(self) 
self._build_cells() @@ -75,7 +76,22 @@ class VisionGrid: return cell.center def apply_action(self, payload: ActionPayload) -> ActionResult: - return self._engine.plan(payload) + result = self._engine.plan(payload) + self._action_history.append(result.model_dump()) + return result + + @property + def action_history(self) -> List[dict[str, Any]]: + return list(self._action_history) + + def summary(self) -> str: + last_action = self._action_history[-1] if self._action_history else None + last_summary = ( + f"Last action: {last_action.get('detail')}" if last_action else "No actions recorded yet" + ) + return ( + f"Grid {self.grid_id} ({self.rows}x{self.columns}) with {len(self.cells)} cells. {last_summary}." + ) class GridManager: @@ -100,3 +116,9 @@ class GridManager: return self._grids[grid_id] except KeyError as exc: raise KeyError(f"Grid {grid_id} not found") from exc + + def get_history(self, grid_id: str) -> List[dict[str, Any]]: + return self.get_grid(grid_id).action_history + + def clear(self) -> None: + self._grids.clear() diff --git a/server/main.py b/server/main.py index 370d532..ae74a3c 100644 --- a/server/main.py +++ b/server/main.py @@ -3,15 +3,17 @@ from fastapi import FastAPI, HTTPException from .config import ServerSettings from .grid import GridManager from .models import ActionPayload, GridDescriptor, GridInitRequest +from .planner import GridPlanner settings = ServerSettings() manager = GridManager(settings) +planner = GridPlanner() app = FastAPI( title="Clickthrough", description="Grid-aware surface that lets an agent plan clicks, drags, and typing on a fake screenshot", - version="0.1.0", + version="0.2.0", ) @@ -33,3 +35,27 @@ def apply_action(payload: ActionPayload): except KeyError as exc: raise HTTPException(status_code=404, detail=str(exc)) from exc return grid.apply_action(payload) + + +@app.get("/grid/{grid_id}/summary") +def grid_summary(grid_id: str): + try: + grid = manager.get_grid(grid_id) + except KeyError as exc: + raise 
HTTPException(status_code=404, detail=str(exc)) from exc + descriptor = grid.describe() + return { + "grid_id": grid_id, + "summary": planner.describe(descriptor), + "details": grid.summary(), + "descriptor": descriptor, + } + + +@app.get("/grid/{grid_id}/history") +def grid_history(grid_id: str): + try: + history = manager.get_history(grid_id) + except KeyError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + return {"grid_id": grid_id, "history": history} diff --git a/server/planner.py b/server/planner.py new file mode 100644 index 0000000..cdaaa67 --- /dev/null +++ b/server/planner.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from math import hypot +from typing import Sequence + +from .models import GridCellModel, GridDescriptor + + +class GridPlanner: + """Helper that picks a grid cell using simple heuristics.""" + + def select_cell( + self, descriptor: GridDescriptor, preferred_label: str | None = None + ) -> GridCellModel | None: + if not descriptor.cells: + return None + + if preferred_label: + match = self._match_label(descriptor.cells, preferred_label) + if match: + return match + + center_point = self._grid_center(descriptor) + return min(descriptor.cells, key=lambda cell: self._distance(self._cell_center(cell), center_point)) + + def describe(self, descriptor: GridDescriptor) -> str: + cell_count = len(descriptor.cells) + return ( + f"Grid {descriptor.grid_id} is {descriptor.rows}x{descriptor.columns} with {cell_count} cells." 
+ ) + + def _grid_center(self, descriptor: GridDescriptor) -> tuple[float, float]: + width = descriptor.metadata.get("width", 0) + height = descriptor.metadata.get("height", 0) + return (width / 2, height / 2) + + def _cell_center(self, cell: GridCellModel) -> tuple[float, float]: + left, top, right, bottom = cell.bounds + return ((left + right) / 2, (top + bottom) / 2) + + def _distance( + self, first: tuple[float, float], second: tuple[float, float] + ) -> float: + return hypot(first[0] - second[0], first[1] - second[1]) + + def _match_label( + self, cells: Sequence[GridCellModel], label: str + ) -> GridCellModel | None: + lowered = label.lower() + for cell in cells: + if cell.label and lowered in cell.label.lower(): + return cell + return None diff --git a/skill/__init__.py b/skill/__init__.py index 97127b5..dc65602 100644 --- a/skill/__init__.py +++ b/skill/__init__.py @@ -1,5 +1,11 @@ """Utility helpers for the Clickthrough agent skill.""" +from .agent_runner import AgentRunResult, ClickthroughAgentRunner from .clickthrough_skill import ActionPlan, ClickthroughSkill -__all__ = ["ClickthroughSkill", "ActionPlan"] +__all__ = [ + "ClickthroughSkill", + "ActionPlan", + "ClickthroughAgentRunner", + "AgentRunResult", +] diff --git a/skill/agent_runner.py b/skill/agent_runner.py new file mode 100644 index 0000000..2cfdde7 --- /dev/null +++ b/skill/agent_runner.py @@ -0,0 +1,62 @@ +from dataclasses import dataclass +from typing import Any, Dict, Sequence + +from .clickthrough_skill import ActionPlan, ClickthroughSkill + + +@dataclass +class AgentRunResult: + summary: Dict[str, Any] + action: Dict[str, Any] + history: Dict[str, Any] + grid: Dict[str, Any] + + +class ClickthroughAgentRunner: + def __init__(self, skill: ClickthroughSkill) -> None: + self.skill = skill + + def run_once( + self, + screenshot_base64: str, + width: int, + height: int, + rows: int = 4, + columns: int = 4, + preferred_label: str | None = None, + action: str = "click", + text: str | None = 
None, + ) -> AgentRunResult: + grid = self.skill.describe_grid( + screenshot_base64=screenshot_base64, + width=width, + height=height, + rows=rows, + columns=columns, + ) + cells = grid.get("cells") or [] + target_cell = self._choose_cell(cells, preferred_label) + plan = ActionPlan( + grid_id=grid["grid_id"], + target_cell=target_cell, + action=action, + text=text, + ) + action_result = self.skill.plan_action(plan) + summary = self.skill.grid_summary(grid["grid_id"]) + history = self.skill.grid_history(grid["grid_id"]) + return AgentRunResult(summary=summary, action=action_result, history=history, grid=grid) + + def _choose_cell( + self, cells: Sequence[dict[str, Any]], preferred_label: str | None + ) -> str: + if not cells: + raise ValueError("Grid contains no cells") + if preferred_label: + search = preferred_label.lower() + for cell in cells: + label_value = cell.get("label") + if label_value and search in label_value.lower(): + return cell["cell_id"] + center_index = len(cells) // 2 + return cells[center_index]["cell_id"] diff --git a/skill/clickthrough_skill.py b/skill/clickthrough_skill.py index eaa9dcc..5f487c7 100644 --- a/skill/clickthrough_skill.py +++ b/skill/clickthrough_skill.py @@ -50,6 +50,16 @@ class ClickthroughSkill: response.raise_for_status() return response.json() + def grid_summary(self, grid_id: str) -> Dict[str, Any]: + response = self._client.get(f"/grid/{grid_id}/summary") + response.raise_for_status() + return response.json() + + def grid_history(self, grid_id: str) -> Dict[str, Any]: + response = self._client.get(f"/grid/{grid_id}/history") + response.raise_for_status() + return response.json() + if __name__ == "__main__": import base64 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f4579f5 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,29 @@ +import base64 + +import pytest + +from server.main import manager + + +@pytest.fixture +def fake_screenshot() -> str: + """Return a reproducible base64 
string representing a dummy screenshot.""" + return base64.b64encode(b"clickthrough-dummy").decode() + + +@pytest.fixture +def default_grid_request(fake_screenshot): + return { + "width": 640, + "height": 480, + "screenshot_base64": fake_screenshot, + "rows": 3, + "columns": 3, + } + + +@pytest.fixture(autouse=True) +def reset_manager_state(): + manager._grids.clear() + yield + manager._grids.clear() diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py new file mode 100644 index 0000000..e1b1e29 --- /dev/null +++ b/tests/test_agent_runner.py @@ -0,0 +1,53 @@ +from typing import Any, Dict + +from skill.agent_runner import ClickthroughAgentRunner +from skill.clickthrough_skill import ActionPlan, ClickthroughSkill + + +class DummySkill(ClickthroughSkill): + def __init__(self): + self.last_plan: ActionPlan | None = None + + def describe_grid( + self, + screenshot_base64: str, + width: int, + height: int, + rows: int = 4, + columns: int = 4, + ) -> Dict[str, Any]: + return { + "grid_id": "dummy-grid", + "cells": [ + {"cell_id": "dummy-grid-1", "label": "button", "bounds": [0, 0, 100, 100]}, + {"cell_id": "dummy-grid-2", "label": "target", "bounds": [100, 0, 200, 100]}, + ], + } + + def plan_action(self, plan: ActionPlan) -> Dict[str, Any]: + self.last_plan = plan + return {"success": True, "target_cell": plan.target_cell} + + def grid_summary(self, grid_id: str) -> Dict[str, Any]: + return {"grid_id": grid_id, "summary": "ok"} + + def grid_history(self, grid_id: str) -> Dict[str, Any]: + return {"grid_id": grid_id, "history": []} + + +def test_agent_runner_prefers_label(): + runner = ClickthroughAgentRunner(DummySkill()) + result = runner.run_once( + screenshot_base64="AA==", + width=120, + height=80, + preferred_label="target", + ) + assert result.action["target_cell"] == "dummy-grid-2" + assert result.summary["summary"] == "ok" + + +def test_agent_runner_defaults_to_center(): + runner = ClickthroughAgentRunner(DummySkill()) + result = 
runner.run_once(screenshot_base64="AA==", width=120, height=80) + assert result.action["target_cell"] == "dummy-grid-2" diff --git a/tests/test_grid.py b/tests/test_grid.py new file mode 100644 index 0000000..ddfe8be --- /dev/null +++ b/tests/test_grid.py @@ -0,0 +1,51 @@ +from server.config import ServerSettings +from server.grid import GridManager +from server.models import ActionPayload, ActionType, GridInitRequest + + +def test_grid_creation_respects_dimensions(default_grid_request): + settings = ServerSettings(grid_rows=2, grid_cols=2) + manager = GridManager(settings) + request = GridInitRequest(**default_grid_request) + grid = manager.create_grid(request) + + descriptor = grid.describe() + assert descriptor.grid_id + assert descriptor.rows == 3 + assert descriptor.columns == 3 + assert len(descriptor.cells) == 9 + assert descriptor.metadata.get("width") == 640 + assert descriptor.metadata.get("height") == 480 + + +def test_grid_action_records_history(default_grid_request): + manager = GridManager(ServerSettings()) + request = GridInitRequest(**default_grid_request) + grid = manager.create_grid(request) + descriptor = grid.describe() + target_cell = descriptor.cells[0].cell_id + + payload = ActionPayload( + grid_id=descriptor.grid_id, + action=ActionType.CLICK, + target_cell=target_cell, + comment="click test", + ) + result = grid.apply_action(payload) + + assert result.success + assert result.coordinates is not None + assert grid.action_history[-1]["coordinates"] == result.coordinates + + +def test_manager_get_grid_missing(default_grid_request): + manager = GridManager(ServerSettings()) + request = GridInitRequest(**default_grid_request) + _ = manager.create_grid(request) + + try: + manager.get_grid("does-not-exist") + found = True + except KeyError: + found = False + assert not found