feat(exec): add low-friction shell execution endpoint #6

Merged
luna merged 2 commits from feat/exec-endpoint into main 2026-04-05 20:23:48 +02:00
5 changed files with 199 additions and 3 deletions

View File

@@ -7,3 +7,10 @@ CLICKTHROUGH_DRY_RUN=false
CLICKTHROUGH_GRID_ROWS=12 CLICKTHROUGH_GRID_ROWS=12
CLICKTHROUGH_GRID_COLS=12 CLICKTHROUGH_GRID_COLS=12
# CLICKTHROUGH_ALLOWED_REGION=0,0,1920,1080 # CLICKTHROUGH_ALLOWED_REGION=0,0,1920,1080
CLICKTHROUGH_EXEC_ENABLED=true
CLICKTHROUGH_EXEC_SECRET=replace-with-a-strong-random-secret
CLICKTHROUGH_EXEC_DEFAULT_SHELL=powershell
CLICKTHROUGH_EXEC_TIMEOUT_S=30
CLICKTHROUGH_EXEC_MAX_TIMEOUT_S=120
CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS=20000

View File

@@ -7,6 +7,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes) - **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes)
- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported) - **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
- **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
@@ -48,6 +49,12 @@ Environment variables:
- `CLICKTHROUGH_GRID_ROWS` (default `12`) - `CLICKTHROUGH_GRID_ROWS` (default `12`)
- `CLICKTHROUGH_GRID_COLS` (default `12`) - `CLICKTHROUGH_GRID_COLS` (default `12`)
- `CLICKTHROUGH_ALLOWED_REGION` (optional `x,y,width,height`) - `CLICKTHROUGH_ALLOWED_REGION` (optional `x,y,width,height`)
- `CLICKTHROUGH_EXEC_ENABLED` (default `true`)
- `CLICKTHROUGH_EXEC_SECRET` (**required for `/exec` to run**)
- `CLICKTHROUGH_EXEC_DEFAULT_SHELL` (default `powershell`; one of `powershell`, `bash`, `cmd`)
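- `CLICKTHROUGH_EXEC_TIMEOUT_S` (default `30`; timeout in seconds used when a request omits `timeout_s`)
- `CLICKTHROUGH_EXEC_MAX_TIMEOUT_S` (default `120`; upper bound in seconds applied to any requested `timeout_s`)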
- `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` (default `20000`)
## Gitea CI ## Gitea CI

View File

@@ -17,5 +17,9 @@
- CI workflow runs syntax checks on push + PR - CI workflow runs syntax checks on push + PR
## Next ## Next
- Manual runtime test on a desktop session (capture + click loop) - [x] Add `POST /exec` endpoint (PowerShell/Bash/CMD) with timeout + stdout/stderr
- Optional: add monitor selection and OCR helper endpoint - [x] Add exec configuration via env (`CLICKTHROUGH_EXEC_*`)
- [x] Document exec API + config
- [x] Create backlog issues for OCR/find/window/input/session-state improvements
- [ ] Open PR for exec feature branch and review/merge
- [x] Require configured exec secret + per-request exec secret header

View File

@@ -10,7 +10,7 @@ x-clickthrough-token: <token>
## `GET /health` ## `GET /health`
Returns status and runtime safety flags. Returns status and runtime safety flags, including `exec` capability config.
## `GET /screen` ## `GET /screen`
@@ -143,6 +143,33 @@ Hotkey:
} }
``` ```
## `POST /exec`
Execute a shell command on the host running Clickthrough.
Requirements:
- `CLICKTHROUGH_EXEC_SECRET` must be configured on the server
- send header `x-clickthrough-exec-secret: <secret>`
```json
{
"command": "Get-Process | Select-Object -First 5",
"shell": "powershell",
"timeout_s": 20,
"cwd": "C:/Users/Paul",
"dry_run": false
}
```
Notes:
- `shell` supports `powershell`, `bash`, `cmd`
- if `shell` is omitted, server uses `CLICKTHROUGH_EXEC_DEFAULT_SHELL`
- `stdout` and `stderr` are each truncated to at most `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` characters
- endpoint can be disabled with `CLICKTHROUGH_EXEC_ENABLED=false`
- if `CLICKTHROUGH_EXEC_SECRET` is missing, `/exec` is blocked (`403`); a missing or wrong `x-clickthrough-exec-secret` header returns `401`
Response includes `stdout`, `stderr`, `exit_code`, timeout state, and execution metadata.
## `POST /batch` ## `POST /batch`
Runs multiple `action` payloads sequentially. Runs multiple `action` payloads sequentially.

View File

@@ -1,6 +1,8 @@
import base64 import base64
import hmac
import io import io
import os import os
import subprocess
import time import time
import uuid import uuid
from typing import Literal, Optional from typing import Literal, Optional
@@ -43,6 +45,12 @@ SETTINGS = {
"default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")), "default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")),
"default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")), "default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")),
"allowed_region": _parse_allowed_region(), "allowed_region": _parse_allowed_region(),
"exec_enabled": _env_bool("CLICKTHROUGH_EXEC_ENABLED", True),
"exec_default_shell": os.getenv("CLICKTHROUGH_EXEC_DEFAULT_SHELL", "powershell").strip().lower(),
"exec_default_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_TIMEOUT_S", "30")),
"exec_max_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_MAX_TIMEOUT_S", "120")),
"exec_max_output_chars": int(os.getenv("CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS", "20000")),
"exec_secret": os.getenv("CLICKTHROUGH_EXEC_SECRET", "").strip(),
} }
@@ -130,6 +138,14 @@ class BatchRequest(BaseModel):
stop_on_error: bool = True stop_on_error: bool = True
class ExecRequest(BaseModel):
    """Request body accepted by ``POST /exec``."""

    # Command line handed to the selected shell; must be 1..10000 characters.
    command: str = Field(min_length=1, max_length=10000)
    # Shell to run the command under; None lets the server pick its default.
    shell: Optional[Literal["powershell", "bash", "cmd"]] = None
    # Requested timeout in seconds (1..600); None falls back to the server default.
    timeout_s: Optional[int] = Field(default=None, ge=1, le=600)
    # Working directory for the command; None leaves it unset.
    cwd: Optional[str] = None
    # When true, the command is resolved and echoed back but not executed.
    dry_run: bool = False
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)): def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
token = SETTINGS["token"] token = SETTINGS["token"]
if token and x_clickthrough_token != token: if token and x_clickthrough_token != token:
@@ -259,6 +275,113 @@ def _import_input_lib():
raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc
def _pick_shell(explicit_shell: str | None) -> str:
shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
if shell_name not in {"powershell", "bash", "cmd"}:
raise HTTPException(status_code=400, detail="unsupported shell")
return shell_name
def _truncate_text(text: str, limit: int) -> tuple[str, bool]:
if len(text) <= limit:
return text, False
return text[:limit], True
def _resolve_exec_program(shell_name: str, command: str) -> list[str]:
if shell_name == "powershell":
return ["powershell", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", command]
if shell_name == "bash":
return ["bash", "-lc", command]
if shell_name == "cmd":
return ["cmd", "/c", command]
raise HTTPException(status_code=400, detail="unsupported shell")
def _coerce_output_text(raw) -> str:
    """Return captured process output as ``str``.

    ``subprocess.TimeoutExpired`` may carry ``bytes`` (or ``None``) even when
    the child was started in text mode. Bytes are decoded here rather than
    passed through ``str()``, which would produce a ``"b'...'"`` repr.
    """
    if raw is None:
        return ""
    if isinstance(raw, (bytes, bytearray)):
        import locale  # local import: only needed on this rare path

        # Mirror the encoding that text-mode pipes use by default.
        return bytes(raw).decode(locale.getpreferredencoding(False), errors="replace")
    return str(raw)


def _exec_command(req: ExecRequest) -> dict:
    """Run (or dry-run) the shell command described by ``req``.

    The effective timeout is the request's ``timeout_s`` (or the configured
    default) capped by ``SETTINGS["exec_max_timeout_s"]``. ``stdout`` and
    ``stderr`` are each truncated to ``SETTINGS["exec_max_output_chars"]``.

    Returns:
        For a dry run: a dict with ``executed=False`` plus the resolved shell,
        argv, timeout and cwd. Otherwise a dict with ``executed=True``,
        ``timed_out``, ``exit_code`` (``None`` on timeout), ``duration_ms``,
        the captured output and its truncation flags.

    Raises:
        HTTPException: 403 when exec is disabled or no exec secret is
            configured; 400 when ``cwd`` is not a directory, the shell is
            unsupported, or the shell executable cannot be found.
    """
    if not SETTINGS["exec_enabled"]:
        raise HTTPException(status_code=403, detail="exec endpoint disabled")
    if not SETTINGS["exec_secret"]:
        raise HTTPException(status_code=403, detail="exec secret not configured")

    run_dry = SETTINGS["dry_run"] or req.dry_run
    shell_name = _pick_shell(req.shell)

    timeout_s = req.timeout_s if req.timeout_s is not None else SETTINGS["exec_default_timeout_s"]
    timeout_s = min(timeout_s, SETTINGS["exec_max_timeout_s"])

    cwd = None
    if req.cwd:
        cwd = os.path.abspath(req.cwd)
        if not os.path.isdir(cwd):
            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = _resolve_exec_program(shell_name, req.command)

    if run_dry:
        return {
            "executed": False,
            "dry_run": True,
            "shell": shell_name,
            "command": req.command,
            "argv": argv,
            "timeout_s": timeout_s,
            "cwd": cwd,
        }

    # Monotonic clock: the measured duration must not jump with wall-clock changes.
    start = time.monotonic()
    timed_out = False
    try:
        completed = subprocess.run(
            argv,
            cwd=cwd,
            capture_output=True,
            text=True,
            # Undecodable bytes in child output must not turn into a 500.
            errors="replace",
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        timed_out = True
        exit_code = None
        raw_stdout, raw_stderr = exc.stdout, exc.stderr
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"shell executable not found: {exc}") from exc
    else:
        exit_code = completed.returncode
        raw_stdout, raw_stderr = completed.stdout, completed.stderr
    duration_ms = int((time.monotonic() - start) * 1000)

    limit = SETTINGS["exec_max_output_chars"]
    stdout, stdout_truncated = _truncate_text(_coerce_output_text(raw_stdout), limit)
    stderr, stderr_truncated = _truncate_text(_coerce_output_text(raw_stderr), limit)

    return {
        "executed": True,
        "timed_out": timed_out,
        "shell": shell_name,
        "command": req.command,
        "argv": argv,
        "timeout_s": timeout_s,
        "cwd": cwd,
        "duration_ms": duration_ms,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
        "stdout_truncated": stdout_truncated,
        "stderr_truncated": stderr_truncated,
    }
def _exec_action(req: ActionRequest) -> dict: def _exec_action(req: ActionRequest) -> dict:
run_dry = SETTINGS["dry_run"] or req.dry_run run_dry = SETTINGS["dry_run"] or req.dry_run
@@ -331,6 +454,13 @@ def health(_: None = Depends(_auth)):
"request_id": _request_id(), "request_id": _request_id(),
"dry_run": SETTINGS["dry_run"], "dry_run": SETTINGS["dry_run"],
"allowed_region": SETTINGS["allowed_region"], "allowed_region": SETTINGS["allowed_region"],
"exec": {
"enabled": SETTINGS["exec_enabled"],
"secret_configured": bool(SETTINGS["exec_secret"]),
"default_shell": SETTINGS["exec_default_shell"],
"default_timeout_s": SETTINGS["exec_default_timeout_s"],
"max_timeout_s": SETTINGS["exec_max_timeout_s"],
},
} }
@@ -449,6 +579,27 @@ def action(req: ActionRequest, _: None = Depends(_auth)):
} }
@app.post("/exec")
def exec_command(
    req: ExecRequest,
    x_clickthrough_exec_secret: Optional[str] = Header(default=None),
    _: None = Depends(_auth),
):
    """Handle ``POST /exec``: check the exec secret header, then run the command.

    Raises:
        HTTPException: 403 when no exec secret is configured on the server;
            401 when the ``x-clickthrough-exec-secret`` header is missing or
            does not match.
    """
    expected = SETTINGS["exec_secret"]
    if not expected:
        raise HTTPException(status_code=403, detail="exec secret not configured")

    # hmac.compare_digest raises TypeError for str operands containing
    # non-ASCII characters; comparing encoded bytes keeps the check
    # constant-time and makes any mismatch a clean 401 instead of a 500.
    supplied = (x_clickthrough_exec_secret or "").encode("utf-8", "surrogatepass")
    if not supplied or not hmac.compare_digest(supplied, expected.encode("utf-8", "surrogatepass")):
        raise HTTPException(status_code=401, detail="invalid exec secret")

    result = _exec_command(req)
    return {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "result": result,
    }
@app.post("/batch") @app.post("/batch")
def batch(req: BatchRequest, _: None = Depends(_auth)): def batch(req: BatchRequest, _: None = Depends(_auth)):
results = [] results = []