init
This commit is contained in:
33
README.md
33
README.md
@@ -1,2 +1,33 @@
|
||||
# Clickthrough
|
||||
Let an Agent interact with your Computer.
|
||||
|
||||
Let an Agent interact with your Computer.
|
||||
|
||||
`Clickthrough` is a proof-of-concept bridge between a vision-aware agent and a headless controller. The project is split into two halves:
|
||||
|
||||
1. A Python server that accepts a screenshot, lays a static grid overlay over it (think of the screenshot broken into cells), and exposes lightweight endpoints to ask questions, plan actions, or even run pointer/keyboard events.
|
||||
2. A **skill** that bundles the HTTP calls/intent construction so we can hardwire the same flow inside OpenClaw later.
|
||||
|
||||
## Server surface (FastAPI)
|
||||
|
||||
- `POST /grid/init`: Accepts a base64 screenshot plus the requested rows/columns, returns a `grid_id`, cell bounds, and helpful metadata. The grid is stored in-memory so the agent can reference cells by ID in later actions.
|
||||
- `POST /grid/action`: Takes a plan (`grid_id`, optional target cell, and an action like `click`/`drag`/`type`) and returns a structured `ActionResult` with computed coordinates for tooling to consume.
|
||||
- `GET /health`: A minimal health check for deployments.
|
||||
|
||||
The server tracks each grid by a UUID and keeps layout metadata so multiple agents can keep in sync with the same screenshot/scene.
|
||||
|
||||
## Skill layer (OpenClaw integration)
|
||||
|
||||
The `skill/` package is a placeholder for how an agent action would look in OpenClaw. It wraps the server calls, interprets the grid cells, and exposes helpers such as `describe_grid()` and `plan_action()` so future work can plug into the agent toolkit directly.
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install dependencies: `python -m pip install -r requirements.txt`.
|
||||
2. Run the server: `uvicorn server.main:app --reload`.
|
||||
3. Use the skill helper to bootstrap a grid, or wire the REST endpoints into a higher-level agent.
|
||||
|
||||
## Next steps
|
||||
|
||||
- Add real OCR/layout logic so cells understand labels.
|
||||
- Turn the action planner into a state machine that can focus/double-click/type/drag.
|
||||
- Persist grid sessions for longer running interactions.
|
||||
- Ship the OpenClaw skill (skill folder) as a plugin that can call `http://localhost:8000` and scaffold the agent’s reasoning.
|
||||
|
||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
fastapi>=0.105.2
|
||||
uvicorn[standard]>=0.23.2
|
||||
pydantic>=2.8.2
|
||||
httpx>=0.27.0
|
||||
1
server/__init__.py
Normal file
1
server/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .main import app # noqa: F401
|
||||
34
server/actions.py
Normal file
34
server/actions.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
from .models import ActionPayload, ActionResult
|
||||
|
||||
|
||||
class ActionEngine:
    """Translate an action payload into a described plan for a single grid.

    The engine performs no input events itself: it resolves the optional
    target cell to coordinates through the grid and reports what would be done.
    """

    def __init__(self, grid) -> None:
        # The grid is only used through ``resolve_cell_center(cell_id)``.
        self.grid = grid

    def plan(self, payload: ActionPayload) -> ActionResult:
        """Return the ``ActionResult`` describing ``payload``.

        Any exception raised by the grid while resolving the target cell
        propagates to the caller unchanged.
        """
        point = self._resolve_coords(payload.target_cell)
        # Echo the free-form fields back, normalising missing values to "".
        echoed = {
            "comment": payload.comment or "",
            "text": payload.text or "",
        }
        return ActionResult(
            success=True,
            detail=self._describe(payload, point),
            coordinates=point,
            payload=echoed,
        )

    def _resolve_coords(self, target_cell: str | None) -> Tuple[int, int] | None:
        """Return the centre of ``target_cell``, or ``None`` if none was given."""
        return self.grid.resolve_cell_center(target_cell) if target_cell else None

    def _describe(
        self, payload: ActionPayload, coords: Tuple[int, int] | None
    ) -> str:
        """Render a one-line human-readable summary of the planned action."""
        if coords:
            where = f"@{payload.target_cell or 'free space'}"
        else:
            where = "(no target)"

        suffix = ""
        if payload.text:
            suffix = f" text='{payload.text}'"

        return f"Plan {payload.action.value} {where}{suffix}"
|
||||
15
server/config.py
Normal file
15
server/config.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""Runtime settings for the Clickthrough server."""

from pathlib import Path

# Pydantic 2 removed ``BaseSettings`` from the top-level ``pydantic`` package;
# importing it from there raises at import time, which would stop the server
# from starting under the project's ``pydantic>=2.8.2`` pin.  The copy bundled
# in ``pydantic.v1`` still understands the ``class Config`` options used below
# and needs no additional dependency.
from pydantic.v1 import BaseSettings


class ServerSettings(BaseSettings):
    """Server configuration with defaults, overridable from the environment.

    Environment variables are matched using the ``CLICKTHROUGH_`` prefix
    (see ``Config.env_prefix``), and a local ``.env`` file is consulted as well.
    """

    # Grid dimensions used when an init request does not specify its own.
    grid_rows: int = 4
    grid_cols: int = 4
    # Inset, in pixels, applied to each edge of a cell.
    cell_margin_px: int = 4
    # NOTE(review): not referenced by the visible server code — presumably
    # reserved for persisting screenshots; confirm before relying on it.
    storage_dir: Path = Path("data/screenshots")
    # NOTE(review): not referenced by the visible server code; unit is
    # presumably seconds — confirm.
    default_timeout: int = 10

    class Config:
        env_prefix = "CLICKTHROUGH_"
        env_file = ".env"
|
||||
102
server/grid.py
Normal file
102
server/grid.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Tuple
|
||||
import uuid
|
||||
|
||||
from .actions import ActionEngine
|
||||
from .config import ServerSettings
|
||||
from .models import (
|
||||
ActionPayload,
|
||||
ActionResult,
|
||||
GridCellModel,
|
||||
GridDescriptor,
|
||||
GridInitRequest,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class _StoredCell:
    """Internal record pairing a cell's public model with its centre point."""

    # Public description of the cell (id, row, column, bounds).
    model: GridCellModel
    # (x, y) midpoint of the cell bounds, computed in VisionGrid._build_cells.
    center: Tuple[int, int]
|
||||
|
||||
|
||||
class VisionGrid:
    """A screenshot split into ``rows`` x ``columns`` addressable cells.

    Cells are keyed by ``"<grid_id>-<row>-<col>"``; each stores its pixel
    bounds and a precomputed centre point that actions use as their target.
    """

    def __init__(
        self,
        request: GridInitRequest,
        grid_id: str,
        rows: int,
        columns: int,
        margin: int = 4,
    ):
        """Capture the request data and eagerly build every cell.

        Args:
            request: Init request supplying the screenshot, memo and pixel size.
            grid_id: Identifier of this grid; also the prefix of every cell id.
            rows: Number of grid rows.
            columns: Number of grid columns.
            margin: Inset in pixels applied to each cell edge. Defaults to 4,
                the value that was previously hard-coded.
        """
        self.grid_id = grid_id
        self.screenshot = request.screenshot_base64
        self.memo = request.memo
        self.rows = rows
        self.columns = columns
        self.width = request.width
        self.height = request.height
        self.margin = margin
        self.cells: Dict[str, _StoredCell] = {}
        self._engine = ActionEngine(self)
        self._build_cells(margin)

    def _build_cells(self, margin: int = 4) -> None:
        """Populate ``self.cells`` with one entry per (row, column) pair.

        Cell size uses integer division, so any remainder pixels on the right
        and bottom edges are not covered by a cell.
        """
        cell_width = max(1, self.width // self.columns)
        cell_height = max(1, self.height // self.rows)

        # Clamp the inset per axis so that a cell smaller than twice the margin
        # still ends up with left <= right and top <= bottom instead of
        # inverted bounds. For cells of 9px or more the default margin is
        # unaffected.
        margin_x = max(0, min(margin, (cell_width - 1) // 2))
        margin_y = max(0, min(margin, (cell_height - 1) // 2))

        for row in range(self.rows):
            top = row * cell_height + margin_y
            bottom = min(self.height - margin_y, (row + 1) * cell_height - margin_y)
            for col in range(self.columns):
                left = col * cell_width + margin_x
                right = min(self.width - margin_x, (col + 1) * cell_width - margin_x)
                cell_id = f"{self.grid_id}-{row}-{col}"
                bounds = (left, top, right, bottom)
                center = ((left + right) // 2, (top + bottom) // 2)
                cell = GridCellModel(
                    cell_id=cell_id,
                    row=row,
                    column=col,
                    bounds=bounds,
                )
                self.cells[cell_id] = _StoredCell(model=cell, center=center)

    def describe(self) -> GridDescriptor:
        """Return the public description of this grid and all of its cells."""
        return GridDescriptor(
            grid_id=self.grid_id,
            rows=self.rows,
            columns=self.columns,
            cells=[cell.model for cell in self.cells.values()],
            metadata={
                "memo": self.memo or "",
                "width": self.width,
                "height": self.height,
            },
        )

    def resolve_cell_center(self, cell_id: str) -> Tuple[int, int]:
        """Return the ``(x, y)`` centre of ``cell_id``.

        Raises:
            KeyError: If ``cell_id`` does not belong to this grid.
        """
        cell = self.cells.get(cell_id)
        # Compare against None explicitly: a stored cell is never "falsy", and
        # the check should not depend on the record's truthiness.
        if cell is None:
            raise KeyError(f"Unknown cell {cell_id}")
        return cell.center

    def apply_action(self, payload: ActionPayload) -> ActionResult:
        """Plan ``payload`` against this grid via the action engine."""
        return self._engine.plan(payload)


class GridManager:
    """In-memory registry of grids, keyed by their generated id."""

    def __init__(self, settings: ServerSettings):
        self.settings = settings
        self._grids: Dict[str, VisionGrid] = {}

    @property
    def grid_count(self) -> int:
        """Number of grids currently held in memory."""
        return len(self._grids)

    def create_grid(self, request: GridInitRequest) -> VisionGrid:
        """Create, register and return a new grid for ``request``.

        Missing (or zero) ``rows``/``columns`` fall back to the configured
        defaults, and the configured cell margin is applied to the grid.
        """
        rows = request.rows or self.settings.grid_rows
        columns = request.columns or self.settings.grid_cols
        grid_id = uuid.uuid4().hex
        grid = VisionGrid(
            request,
            grid_id,
            rows,
            columns,
            margin=self.settings.cell_margin_px,
        )
        self._grids[grid_id] = grid
        return grid

    def get_grid(self, grid_id: str) -> VisionGrid:
        """Return the grid registered under ``grid_id``.

        Raises:
            KeyError: If no such grid exists; the message names the grid.
        """
        try:
            return self._grids[grid_id]
        except KeyError as exc:
            raise KeyError(f"Grid {grid_id} not found") from exc
|
||||
35
server/main.py
Normal file
35
server/main.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
|
||||
from .config import ServerSettings
|
||||
from .grid import GridManager
|
||||
from .models import ActionPayload, GridDescriptor, GridInitRequest
|
||||
|
||||
|
||||
# Settings are instantiated once, when this module is imported.
settings = ServerSettings()
# Single in-process grid registry shared by every request handler below.
manager = GridManager(settings)

# ASGI application object (the README runs it as `uvicorn server.main:app`).
app = FastAPI(
    title="Clickthrough",
    description="Grid-aware surface that lets an agent plan clicks, drags, and typing on a fake screenshot",
    version="0.1.0",
)
|
||||
|
||||
|
||||
@app.get("/health")
def health_check() -> dict[str, str]:
    """Report liveness together with the number of grids held in memory."""
    active_grids = manager.grid_count
    # The count is sent as a string so every value in the body is a str.
    return dict(status="ok", grid_count=f"{active_grids}")
|
||||
|
||||
|
||||
@app.post("/grid/init", response_model=GridDescriptor)
def init_grid(request: GridInitRequest) -> GridDescriptor:
    """Register a new grid for the posted screenshot and return its layout."""
    return manager.create_grid(request).describe()
|
||||
|
||||
|
||||
@app.post("/grid/action")
def apply_action(payload: ActionPayload):
    """Plan an action against a previously created grid.

    Returns the result produced by the grid for ``payload``.

    Raises:
        HTTPException: 404 when either the grid id or the target cell id is
            unknown.
    """
    try:
        grid = manager.get_grid(payload.grid_id)
        # Resolving an unknown target cell also raises KeyError, so it has to
        # sit inside the same ``try``; otherwise it surfaces as a 500.
        return grid.apply_action(payload)
    except KeyError as exc:
        # ``str(KeyError(...))`` is the repr of its argument and would wrap the
        # message in quotes; read the original message from ``args`` instead.
        message = exc.args[0] if exc.args else str(exc)
        raise HTTPException(status_code=404, detail=str(message)) from exc
|
||||
55
server/models.py
Normal file
55
server/models.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ActionType(str, Enum):
    """Kinds of input action a client may ask the server to plan.

    Mixing in ``str`` makes each member compare equal to its string value.
    """

    CLICK = "click"
    DOUBLE_CLICK = "double_click"
    DRAG = "drag"
    TYPE = "type"
    SCROLL = "scroll"
|
||||
|
||||
|
||||
class GridInitRequest(BaseModel):
    """Request body for creating a grid over a screenshot.

    Dimensions are validated up front so that a nonsensical request is
    rejected with a validation error instead of producing cells with
    inverted or empty bounds.
    """

    # Pixel size of the screenshot; must be strictly positive.
    width: int = Field(..., gt=0)
    height: int = Field(..., gt=0)
    # Screenshot contents, base64-encoded by the client.
    screenshot_base64: str
    # Requested grid shape. ``None`` or 0 means "use the server default";
    # negative values are rejected.
    rows: Optional[int] = Field(default=None, ge=0)
    columns: Optional[int] = Field(default=None, ge=0)
    # Optional free-form note echoed back in the grid metadata.
    memo: Optional[str] = None
|
||||
|
||||
|
||||
class GridCellModel(BaseModel):
    """Public description of a single grid cell."""

    # Identifier clients pass back as ``target_cell``.
    cell_id: str
    # Zero-based position of the cell inside the grid.
    row: int
    column: int
    # Pixel rectangle as (left, top, right, bottom).
    bounds: Tuple[int, int, int, int]
    # Optional text label; nothing in the visible server code sets it yet.
    label: Optional[str] = None
|
||||
|
||||
|
||||
class GridDescriptor(BaseModel):
    """Full layout of a grid: its id, shape, cells and extra metadata."""

    grid_id: str
    rows: int
    columns: int
    # One entry per cell of the grid.
    cells: List[GridCellModel]
    # Free-form extras; a fresh dict per instance via ``default_factory``.
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class ActionPayload(BaseModel):
    """Request body describing one action to plan against a grid."""

    # Id of the grid the action refers to.
    grid_id: str
    action: ActionType
    # Cell to act on; when omitted the action has no target coordinates.
    target_cell: Optional[str] = None
    # Text associated with the action (e.g. what to type), if any.
    text: Optional[str] = None
    # Free-form note echoed back in the result payload.
    comment: Optional[str] = None
    # Extra action parameters. NOTE(review): not read by the visible planning
    # code — confirm intended use.
    data: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class ActionResult(BaseModel):
    """Outcome of planning an action."""

    success: bool
    # Human-readable summary of the planned action.
    detail: str
    # (x, y) target point, or None when the action had no target cell.
    coordinates: Optional[Tuple[int, int]] = None
    # Additional fields echoed back to the caller.
    payload: Dict[str, Any] = Field(default_factory=dict)
|
||||
5
skill/__init__.py
Normal file
5
skill/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Utility helpers for the Clickthrough agent skill."""
|
||||
|
||||
from .clickthrough_skill import ActionPlan, ClickthroughSkill
|
||||
|
||||
__all__ = ["ClickthroughSkill", "ActionPlan"]
|
||||
64
skill/clickthrough_skill.py
Normal file
64
skill/clickthrough_skill.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
@dataclass
class ActionPlan:
    """Client-side description of one action to send to the server."""

    # Id of the grid the action targets.
    grid_id: str
    # Cell id to act on, or None for an action without a target.
    target_cell: str | None
    # Action name as sent on the wire (e.g. "click").
    action: str
    # Optional text accompanying the action.
    text: str | None = None
|
||||
|
||||
|
||||
class ClickthroughSkill:
    """Lightweight wrapper around the Clickthrough HTTP API.

    The wrapper owns an ``httpx.Client``. Call ``close()`` when done, or use
    the instance as a context manager, so the underlying connections are
    released.
    """

    def __init__(
        self,
        server_url: str = "http://localhost:8000",
        timeout: float = 10,
    ) -> None:
        """Create the HTTP client.

        Args:
            server_url: Base URL of the Clickthrough server.
            timeout: Timeout handed to the HTTP client for each request.
        """
        self._client = httpx.Client(base_url=server_url, timeout=timeout)

    def close(self) -> None:
        """Release the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> "ClickthroughSkill":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def _post(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``payload`` as JSON to ``path`` and return the decoded body.

        Raises whatever ``raise_for_status()`` raises on an error response.
        """
        response = self._client.post(path, json=payload)
        response.raise_for_status()
        return response.json()

    def describe_grid(
        self,
        screenshot_base64: str,
        width: int,
        height: int,
        rows: int = 4,
        columns: int = 4,
        memo: str = "agent-powered grid",
    ) -> Dict[str, Any]:
        """Create a grid on the server and return its JSON description.

        Args:
            screenshot_base64: Base64-encoded screenshot.
            width: Screenshot width in pixels.
            height: Screenshot height in pixels.
            rows: Requested number of grid rows.
            columns: Requested number of grid columns.
            memo: Free-form note stored with the grid.
        """
        payload = {
            "width": width,
            "height": height,
            "rows": rows,
            "columns": columns,
            "screenshot_base64": screenshot_base64,
            "memo": memo,
        }
        return self._post("/grid/init", payload)

    def plan_action(self, plan: ActionPlan) -> Dict[str, Any]:
        """Send ``plan`` to the server and return the JSON action result."""
        payload = {
            "grid_id": plan.grid_id,
            "action": plan.action,
            "target_cell": plan.target_cell,
            "text": plan.text,
            "comment": "skill-generated plan",
        }
        return self._post("/grid/action", payload)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import base64

    # Manual smoke test: create a grid from a placeholder screenshot on the
    # default server URL, then plan a click on the first returned cell.
    skill = ClickthroughSkill()
    dummy = base64.b64encode(b"fake-screenshot").decode()
    grid = skill.describe_grid(dummy, width=800, height=600)

    cells = grid.get("cells", [])
    print("Grid cells:", len(cells))

    if cells:
        click_plan = ActionPlan(
            grid_id=grid["grid_id"],
            target_cell=cells[0]["cell_id"],
            action="click",
        )
        result = skill.plan_action(click_plan)
        print("Action result:", result)
|
||||
Reference in New Issue
Block a user