diff --git a/README.md b/README.md index 5a2c84e..233c457 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,33 @@ # Clickthrough -Let an Agent interact with your Computer. \ No newline at end of file + +Let an Agent interact with your Computer. + +`Clickthrough` is a proof-of-concept bridge between a vision-aware agent and a headless controller. The project is split into two halves: + +1. A Python server that accepts a static grid overlay (think of a screenshot broken into cells) and exposes lightweight endpoints to ask questions, plan actions, or even run pointer/keyboard events. +2. A **skill** that bundles the HTTP calls/intent construction so we can hardwire the same flow inside OpenClaw later. + +## Server surface (FastAPI) + +- `POST /grid/init`: Accepts a base64 screenshot with its `width`/`height` plus the optional rows/columns, and returns a `grid_id`, cell bounds, and helpful metadata. The grid is stored in memory so the agent can reference cells by ID in later actions. +- `POST /grid/action`: Takes a plan (`grid_id`, optional target cell, and an action like `click`/`drag`/`type`) and returns a structured `ActionResult` with computed coordinates for tooling to consume. +- `GET /health`: A minimal health check for deployments. + +The server tracks each grid by a UUID and keeps layout metadata so multiple agents can keep in sync with the same screenshot/scene. + +## Skill layer (OpenClaw integration) + +The `skill/` package is a placeholder for how an agent action would look in OpenClaw. It wraps the server calls, interprets the grid cells, and exposes helpers such as `describe_grid()` and `plan_action()` so future work can plug into the agent toolkit directly. + +## Getting started + +1. Install dependencies: `python -m pip install -r requirements.txt`. +2. Run the server: `uvicorn server.main:app --reload`. +3. Use the skill helper to bootstrap a grid, or wire the REST endpoints into a higher-level agent. + +## Next steps + +- Add real OCR/layout logic so cells understand labels. 
class ActionEngine:
    """Turn an ``ActionPayload`` into an ``ActionResult`` for one grid.

    The engine does not perform any input events itself. It resolves the
    optional target cell to coordinates via the grid and returns a short
    description of the planned action together with those coordinates.
    """

    def __init__(self, grid) -> None:
        # ``grid`` only needs a ``resolve_cell_center(cell_id)`` method.
        # Annotating it would need an import of the grid module, which
        # itself imports this module.
        self.grid = grid

    def plan(self, payload: ActionPayload) -> ActionResult:
        """Build the result describing ``payload``.

        Any exception raised by ``grid.resolve_cell_center`` for a target
        cell that cannot be resolved propagates to the caller unchanged.
        """
        coords = self._resolve_coords(payload.target_cell)
        return ActionResult(
            success=True,
            detail=self._describe(payload, coords),
            coordinates=coords,
            # Missing comment/text become empty strings so both keys are
            # always present in the result.
            payload={"comment": payload.comment or "", "text": payload.text or ""},
        )

    def _resolve_coords(self, target_cell: str | None) -> Tuple[int, int] | None:
        """Return the grid's coordinates for ``target_cell``, or ``None``.

        ``None`` and the empty string both mean "no target".
        """
        if not target_cell:
            return None
        return self.grid.resolve_cell_center(target_cell)

    def _describe(
        self, payload: ActionPayload, coords: Tuple[int, int] | None
    ) -> str:
        """Format the human-readable summary stored in ``ActionResult.detail``."""
        # Coordinates exist only when a target cell was given, so the cell
        # id can be used directly; no placeholder label is needed.
        location = "(no target)" if coords is None else f"@{payload.target_cell}"
        extra = f" text='{payload.text}'" if payload.text else ""
        return f"Plan {payload.action.value} {location}{extra}"
@dataclass
class _StoredCell:
    """A cell's public model paired with its precomputed centre point."""

    model: GridCellModel
    center: Tuple[int, int]


class VisionGrid:
    """A rows x columns grid of cells laid over one screenshot.

    Cell ids have the form ``"<grid_id>-<row>-<col>"``. Every cell is built
    once in the constructor and kept in ``self.cells`` in row-major order.
    """

    def __init__(
        self,
        request: GridInitRequest,
        grid_id: str,
        rows: int,
        columns: int,
        margin: int = 4,
    ):
        """Store the request data and build all cells.

        Args:
            request: Supplies ``screenshot_base64``, ``memo``, ``width`` and
                ``height``.
            grid_id: Identifier used as the prefix of every cell id.
            rows: Number of grid rows.
            columns: Number of grid columns.
            margin: Requested inset applied to every side of each cell.
                Defaults to 4, the value that was previously hard-coded.
        """
        self.grid_id = grid_id
        self.screenshot = request.screenshot_base64
        self.memo = request.memo
        self.rows = rows
        self.columns = columns
        self.width = request.width
        self.height = request.height
        self.margin = margin
        self.cells: Dict[str, _StoredCell] = {}
        self._engine = ActionEngine(self)
        self._build_cells(margin)

    @staticmethod
    def _cell_geometry(
        width: int, height: int, rows: int, columns: int, margin: int
    ) -> list[Tuple[int, int, Tuple[int, int, int, int], Tuple[int, int]]]:
        """Return ``(row, col, bounds, center)`` for every cell, row-major.

        ``bounds`` is ``(left, top, right, bottom)``. Cell size uses integer
        division, so any remainder on the right or bottom edge belongs to no
        cell.
        """
        cell_width = max(1, width // columns)
        cell_height = max(1, height // rows)
        # A margin of half the cell size or more would push ``left`` past
        # ``right`` (or ``top`` past ``bottom``). Cap the inset so every
        # cell keeps at least one unit of extent, and never go negative.
        inset = max(0, min(margin, (cell_width - 1) // 2, (cell_height - 1) // 2))

        geometry = []
        for row in range(rows):
            top = row * cell_height + inset
            bottom = min(height - inset, (row + 1) * cell_height - inset)
            for col in range(columns):
                left = col * cell_width + inset
                right = min(width - inset, (col + 1) * cell_width - inset)
                center = ((left + right) // 2, (top + bottom) // 2)
                geometry.append((row, col, (left, top, right, bottom), center))
        return geometry

    def _build_cells(self, margin: int = 4) -> None:
        """Populate ``self.cells`` from the grid dimensions and ``margin``."""
        for row, col, bounds, center in self._cell_geometry(
            self.width, self.height, self.rows, self.columns, margin
        ):
            cell_id = f"{self.grid_id}-{row}-{col}"
            model = GridCellModel(cell_id=cell_id, row=row, column=col, bounds=bounds)
            self.cells[cell_id] = _StoredCell(model=model, center=center)

    def describe(self) -> GridDescriptor:
        """Return the grid layout: id, dimensions, cell models and metadata."""
        return GridDescriptor(
            grid_id=self.grid_id,
            rows=self.rows,
            columns=self.columns,
            cells=[cell.model for cell in self.cells.values()],
            metadata={
                "memo": self.memo or "",
                "width": self.width,
                "height": self.height,
            },
        )

    def resolve_cell_center(self, cell_id: str) -> Tuple[int, int]:
        """Return the centre point of ``cell_id``.

        Raises:
            KeyError: If this grid has no cell with that id.
        """
        cell = self.cells.get(cell_id)
        if cell is None:
            raise KeyError(f"Unknown cell {cell_id}")
        return cell.center

    def apply_action(self, payload: ActionPayload) -> ActionResult:
        """Delegate ``payload`` to this grid's ``ActionEngine``."""
        return self._engine.plan(payload)


class GridManager:
    """In-memory registry of ``VisionGrid`` objects keyed by grid id.

    Grids are only ever added here; this class has no removal or expiry.
    """

    def __init__(self, settings: ServerSettings):
        self.settings = settings
        self._grids: Dict[str, VisionGrid] = {}

    @property
    def grid_count(self) -> int:
        """Number of grids currently registered."""
        return len(self._grids)

    def create_grid(self, request: GridInitRequest) -> VisionGrid:
        """Create, register and return a grid for ``request``.

        Missing (``None``) or zero ``rows``/``columns`` fall back to the
        configured defaults. The cell margin comes from
        ``settings.cell_margin_px``.
        """
        rows = request.rows or self.settings.grid_rows
        columns = request.columns or self.settings.grid_cols
        grid_id = uuid.uuid4().hex
        grid = VisionGrid(
            request,
            grid_id,
            rows,
            columns,
            margin=self.settings.cell_margin_px,
        )
        self._grids[grid_id] = grid
        return grid

    def get_grid(self, grid_id: str) -> VisionGrid:
        """Return the registered grid for ``grid_id``.

        Raises:
            KeyError: If no grid with that id is registered.
        """
        try:
            return self._grids[grid_id]
        except KeyError as exc:
            raise KeyError(f"Grid {grid_id} not found") from exc
@app.post("/grid/action")
def apply_action(payload: ActionPayload):
    """Plan ``payload.action`` against a previously initialised grid.

    Returns the result of the grid's ``apply_action``. Responds with
    HTTP 404 when ``payload.grid_id`` is not registered or when
    ``payload.target_cell`` does not exist in that grid.
    """
    try:
        grid = manager.get_grid(payload.grid_id)
        # Planning resolves ``target_cell`` and raises ``KeyError`` for an
        # unknown cell. It must be caught here too, otherwise the client
        # gets an unhandled-exception 500 instead of a 404.
        return grid.apply_action(payload)
    except KeyError as exc:
        # ``str(KeyError(msg))`` is ``repr(msg)`` and would wrap the detail
        # in extra quotes, so read the message from ``args`` instead.
        detail = str(exc.args[0]) if exc.args else "Not found"
        raise HTTPException(status_code=404, detail=detail) from exc
"ActionPlan"] diff --git a/skill/clickthrough_skill.py b/skill/clickthrough_skill.py new file mode 100644 index 0000000..eaa9dcc --- /dev/null +++ b/skill/clickthrough_skill.py @@ -0,0 +1,64 @@ +from dataclasses import dataclass +from typing import Any, Dict + +import httpx + + +@dataclass +class ActionPlan: + grid_id: str + target_cell: str | None + action: str + text: str | None = None + + +class ClickthroughSkill: + """Lightweight wrapper around the Clickthrough HTTP API.""" + + def __init__(self, server_url: str = "http://localhost:8000") -> None: + self._client = httpx.Client(base_url=server_url, timeout=10) + + def describe_grid( + self, + screenshot_base64: str, + width: int, + height: int, + rows: int = 4, + columns: int = 4, + ) -> Dict[str, Any]: + payload = { + "width": width, + "height": height, + "rows": rows, + "columns": columns, + "screenshot_base64": screenshot_base64, + "memo": "agent-powered grid", + } + response = self._client.post("/grid/init", json=payload) + response.raise_for_status() + return response.json() + + def plan_action(self, plan: ActionPlan) -> Dict[str, Any]: + payload = { + "grid_id": plan.grid_id, + "action": plan.action, + "target_cell": plan.target_cell, + "text": plan.text, + "comment": "skill-generated plan", + } + response = self._client.post("/grid/action", json=payload) + response.raise_for_status() + return response.json() + + +if __name__ == "__main__": + import base64 + + dummy = base64.b64encode(b"fake-screenshot").decode() + skill = ClickthroughSkill() + grid = skill.describe_grid(dummy, width=800, height=600) + print("Grid cells:", len(grid.get("cells", []))) + if grid.get("cells"): + first_cell = grid["cells"][0]["cell_id"] + result = skill.plan_action(ActionPlan(grid_id=grid["grid_id"], target_cell=first_cell, action="click")) + print("Action result:", result)