init

2026-04-05 19:15:12 +02:00
parent 101753fa14
commit a2ef50401b
10 changed files with 347 additions and 1 deletions
--- a/README.md
+++ b/README.md
@@ -1,2 +1,33 @@
 # Clickthrough
-Let an Agent interact with your Computer.
+
+Let an Agent interact with your Computer.
+
+`Clickthrough` is a proof-of-concept bridge between a vision-aware agent and a headless controller. The project is split into two halves:
+
+1. A Python server that accepts a static grid overlay (think of a screenshot broken into cells) and exposes lightweight endpoints to ask questions, plan actions, or even run pointer/keyboard events.
+2. A **skill** that bundles the HTTP calls/intent construction so we can hardwire the same flow inside OpenClaw later.
+
+## Server surface (FastAPI)
+
+- `POST /grid/init`: Accepts a base64 screenshot plus the requested rows/columns, returns a `grid_id`, cell bounds, and helpful metadata. The grid is stored in-memory so the agent can reference cells by ID in later actions.
+- `POST /grid/action`: Takes a plan (`grid_id`, optional target cell, and an action like `click`/`drag`/`type`) and returns a structured `ActionResult` with computed coordinates for tooling to consume.
+- `GET /health`: A minimal health check for deployments.
+
+The server tracks each grid by a UUID and keeps layout metadata so multiple agents can keep in sync with the same screenshot/scene.
+
+## Skill layer (OpenClaw integration)
+
+The `skill/` package is a placeholder for how an agent action would look in OpenClaw. It wraps the server calls, interprets the grid cells, and exposes helpers such as `describe_grid()` and `plan_action()` so future work can plug into the agent toolkit directly.
+
+## Getting started
+
+1. Install dependencies: `python -m pip install -r requirements.txt`.
+2. Run the server: `uvicorn server.main:app --reload`.
+3. Use the skill helper to bootstrap a grid, or wire the REST endpoints into a higher-level agent.
+
+## Next steps
+
+- Add real OCR/layout logic so cells understand labels.
+- Turn the action planner into a state machine that can focus/double-click/type/drag.
+- Persist grid sessions for longer running interactions.
+- Ship the OpenClaw skill (skill folder) as a plugin that can call `http://localhost:8000` and scaffold the agent’s reasoning.
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+fastapi>=0.105.2
+uvicorn[standard]>=0.23.2
+pydantic>=2.8.2
+httpx>=0.30.0
--- a/server/__init__.py
+++ b/server/__init__.py
@@ -0,0 +1 @@
+from .main import app  # noqa: F401
--- a/server/actions.py
+++ b/server/actions.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from typing import Tuple
+
+from .models import ActionPayload, ActionResult
+
+
+class ActionEngine:
+    def __init__(self, grid) -> None:
+        self.grid = grid
+
+    def plan(self, payload: ActionPayload) -> ActionResult:
+        coords = self._resolve_coords(payload.target_cell)
+        detail = self._describe(payload, coords)
+        return ActionResult(
+            success=True,
+            detail=detail,
+            coordinates=coords,
+            payload={"comment": payload.comment or "", "text": payload.text or ""},
+        )
+
+    def _resolve_coords(self, target_cell: str | None) -> Tuple[int, int] | None:
+        if not target_cell:
+            return None
+        return self.grid.resolve_cell_center(target_cell)
+
+    def _describe(
+        self, payload: ActionPayload, coords: Tuple[int, int] | None
+    ) -> str:
+        cell_info = payload.target_cell or "free space"
+        location = f"@{cell_info}" if coords else "(no target)"
+        action_hint = payload.action.value
+        extra = f" text='{payload.text}'" if payload.text else ""
+        return f"Plan {action_hint} {location}{extra}"
--- a/server/config.py
+++ b/server/config.py
@@ -0,0 +1,15 @@
+from pathlib import Path
+
+from pydantic import BaseSettings
+
+
+class ServerSettings(BaseSettings):
+    grid_rows: int = 4
+    grid_cols: int = 4
+    cell_margin_px: int = 4
+    storage_dir: Path = Path("data/screenshots")
+    default_timeout: int = 10
+
+    class Config:
+        env_prefix = "CLICKTHROUGH_"
+        env_file = ".env"
--- a/server/grid.py
+++ b/server/grid.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, Tuple
+import uuid
+
+from .actions import ActionEngine
+from .config import ServerSettings
+from .models import (
+    ActionPayload,
+    ActionResult,
+    GridCellModel,
+    GridDescriptor,
+    GridInitRequest,
+)
+
+
+@dataclass
+class _StoredCell:
+    """Internal pairing of a cell's public model with its centre point."""
+
+    # Model handed out to clients through ``VisionGrid.describe``.
+    model: GridCellModel
+    # (x, y) midpoint of the cell bounds, returned by ``resolve_cell_center``.
+    center: Tuple[int, int]
+
+
+class VisionGrid:
+    def __init__(self, request: GridInitRequest, grid_id: str, rows: int, columns: int):
+        self.grid_id = grid_id
+        self.screenshot = request.screenshot_base64
+        self.memo = request.memo
+        self.rows = rows
+        self.columns = columns
+        self.width = request.width
+        self.height = request.height
+        self.cells: Dict[str, _StoredCell] = {}
+        self._engine = ActionEngine(self)
+        self._build_cells()
+
+    def _build_cells(self, margin: int = 4) -> None:
+        cell_width = max(1, self.width // self.columns)
+        cell_height = max(1, self.height // self.rows)
+
+        for row in range(self.rows):
+            for col in range(self.columns):
+                left = col * cell_width + margin
+                top = row * cell_height + margin
+                right = min(self.width - margin, (col + 1) * cell_width - margin)
+                bottom = min(self.height - margin, (row + 1) * cell_height - margin)
+                cell_id = f"{self.grid_id}-{row}-{col}"
+                bounds = (left, top, right, bottom)
+                center = ((left + right) // 2, (top + bottom) // 2)
+                cell = GridCellModel(
+                    cell_id=cell_id,
+                    row=row,
+                    column=col,
+                    bounds=bounds,
+                )
+                self.cells[cell_id] = _StoredCell(model=cell, center=center)
+
+    def describe(self) -> GridDescriptor:
+        return GridDescriptor(
+            grid_id=self.grid_id,
+            rows=self.rows,
+            columns=self.columns,
+            cells=[cell.model for cell in self.cells.values()],
+            metadata={
+                "memo": self.memo or "",
+                "width": self.width,
+                "height": self.height,
+            },
+        )
+
+    def resolve_cell_center(self, cell_id: str) -> Tuple[int, int]:
+        cell = self.cells.get(cell_id)
+        if not cell:
+            raise KeyError(f"Unknown cell {cell_id}")
+        return cell.center
+
+    def apply_action(self, payload: ActionPayload) -> ActionResult:
+        return self._engine.plan(payload)
+
+
+class GridManager:
+    def __init__(self, settings: ServerSettings):
+        self.settings = settings
+        self._grids: Dict[str, VisionGrid] = {}
+
+    @property
+    def grid_count(self) -> int:
+        return len(self._grids)
+
+    def create_grid(self, request: GridInitRequest) -> VisionGrid:
+        rows = request.rows or self.settings.grid_rows
+        columns = request.columns or self.settings.grid_cols
+        grid_id = uuid.uuid4().hex
+        grid = VisionGrid(request, grid_id, rows, columns)
+        self._grids[grid_id] = grid
+        return grid
+
+    def get_grid(self, grid_id: str) -> VisionGrid:
+        try:
+            return self._grids[grid_id]
+        except KeyError as exc:
+            raise KeyError(f"Grid {grid_id} not found") from exc
--- a/server/main.py
+++ b/server/main.py
@@ -0,0 +1,35 @@
+from fastapi import FastAPI, HTTPException
+
+from .config import ServerSettings
+from .grid import GridManager
+from .models import ActionPayload, GridDescriptor, GridInitRequest
+
+
+# Module-level singletons: created once at import time and shared by every
+# route handler in this module.
+settings = ServerSettings()
+manager = GridManager(settings)
+
+# Application object; the README starts it with ``uvicorn server.main:app``.
+app = FastAPI(
+    title="Clickthrough",
+    description="Grid-aware surface that lets an agent plan clicks, drags, and typing on a fake screenshot",
+    version="0.1.0",
+)
+
+
+@app.get("/health")
+def health_check() -> dict[str, str]:
+    return {"status": "ok", "grid_count": str(manager.grid_count)}
+
+
+@app.post("/grid/init", response_model=GridDescriptor)
+def init_grid(request: GridInitRequest) -> GridDescriptor:
+    grid = manager.create_grid(request)
+    return grid.describe()
+
+
+@app.post("/grid/action")
+def apply_action(payload: ActionPayload):
+    try:
+        grid = manager.get_grid(payload.grid_id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    return grid.apply_action(payload)
--- a/server/models.py
+++ b/server/models.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+from pydantic import BaseModel, Field
+
+
+class ActionType(str, Enum):
+    """Action names accepted in ``ActionPayload.action``.
+
+    Subclassing ``str`` lets members compare equal to their plain string
+    values such as ``"click"``.
+    """
+
+    CLICK = "click"
+    DOUBLE_CLICK = "double_click"
+    DRAG = "drag"
+    TYPE = "type"
+    SCROLL = "scroll"
+
+
+class GridInitRequest(BaseModel):
+    width: int
+    height: int
+    screenshot_base64: str
+    rows: Optional[int] = None
+    columns: Optional[int] = None
+    memo: Optional[str] = None
+
+
+class GridCellModel(BaseModel):
+    """One cell of a grid."""
+
+    cell_id: str
+    # Zero-based position of the cell within the grid.
+    row: int
+    column: int
+    # Four-integer rectangle for the cell.
+    bounds: Tuple[int, int, int, int]
+    # Optional label; defaults to None.
+    label: Optional[str] = None
+
+
+class GridDescriptor(BaseModel):
+    """Description of a whole grid: its id, dimensions, cells and metadata."""
+
+    grid_id: str
+    rows: int
+    columns: int
+    cells: List[GridCellModel]
+    # Free-form extra information; a fresh empty dict per instance by default.
+    metadata: Dict[str, Any] = Field(default_factory=dict)
+
+
+class ActionPayload(BaseModel):
+    """An action an agent wants planned against an existing grid."""
+
+    # Id of the grid the action refers to.
+    grid_id: str
+    action: ActionType
+    # Id of the cell to act on; optional.
+    target_cell: Optional[str] = None
+    # Optional text and free-form comment accompanying the action.
+    text: Optional[str] = None
+    comment: Optional[str] = None
+    # Additional free-form data; a fresh empty dict per instance by default.
+    data: Dict[str, Any] = Field(default_factory=dict)
+
+
+class ActionResult(BaseModel):
+    """Outcome of planning an action."""
+
+    success: bool
+    # Human-readable summary of the plan.
+    detail: str
+    # Pair of integer coordinates, or None when there is no target.
+    coordinates: Optional[Tuple[int, int]] = None
+    # Extra values returned alongside the plan; empty dict by default.
+    payload: Dict[str, Any] = Field(default_factory=dict)
--- a/skill/__init__.py
+++ b/skill/__init__.py
@@ -0,0 +1,5 @@
+"""Utility helpers for the Clickthrough agent skill."""
+
+from .clickthrough_skill import ActionPlan, ClickthroughSkill
+
+__all__ = ["ClickthroughSkill", "ActionPlan"]
--- a/skill/clickthrough_skill.py
+++ b/skill/clickthrough_skill.py
@@ -0,0 +1,64 @@
+from dataclasses import dataclass
+from typing import Any, Dict
+
+import httpx
+
+
+@dataclass
+class ActionPlan:
+    """Client-side description of one action to send to ``/grid/action``."""
+
+    # Id of the grid the action belongs to.
+    grid_id: str
+    # Id of the cell to act on, or None for an action without a target cell.
+    target_cell: str | None
+    # Action name sent to the server as-is (e.g. "click").
+    action: str
+    # Optional text sent along with the action.
+    text: str | None = None
+
+
+class ClickthroughSkill:
+    """Lightweight wrapper around the Clickthrough HTTP API."""
+
+    def __init__(self, server_url: str = "http://localhost:8000") -> None:
+        self._client = httpx.Client(base_url=server_url, timeout=10)
+
+    def describe_grid(
+        self,
+        screenshot_base64: str,
+        width: int,
+        height: int,
+        rows: int = 4,
+        columns: int = 4,
+    ) -> Dict[str, Any]:
+        payload = {
+            "width": width,
+            "height": height,
+            "rows": rows,
+            "columns": columns,
+            "screenshot_base64": screenshot_base64,
+            "memo": "agent-powered grid",
+        }
+        response = self._client.post("/grid/init", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+    def plan_action(self, plan: ActionPlan) -> Dict[str, Any]:
+        payload = {
+            "grid_id": plan.grid_id,
+            "action": plan.action,
+            "target_cell": plan.target_cell,
+            "text": plan.text,
+            "comment": "skill-generated plan",
+        }
+        response = self._client.post("/grid/action", json=payload)
+        response.raise_for_status()
+        return response.json()
+
+
+if __name__ == "__main__":
+    import base64
+
+    dummy = base64.b64encode(b"fake-screenshot").decode()
+    skill = ClickthroughSkill()
+    grid = skill.describe_grid(dummy, width=800, height=600)
+    print("Grid cells:", len(grid.get("cells", [])))
+    if grid.get("cells"):
+        first_cell = grid["cells"][0]["cell_id"]
+        result = skill.plan_action(ActionPlan(grid_id=grid["grid_id"], target_cell=first_cell, action="click"))
+        print("Action result:", result)