feat(exec): add low-friction shell execution endpoint #6

Merged
luna merged 2 commits from feat/exec-endpoint into main 2026-04-05 20:23:48 +02:00
5 changed files with 176 additions and 3 deletions
Showing only changes of commit 930cdd2887 - Show all commits

View File

@@ -7,3 +7,9 @@ CLICKTHROUGH_DRY_RUN=false
CLICKTHROUGH_GRID_ROWS=12 CLICKTHROUGH_GRID_ROWS=12
CLICKTHROUGH_GRID_COLS=12 CLICKTHROUGH_GRID_COLS=12
# CLICKTHROUGH_ALLOWED_REGION=0,0,1920,1080 # CLICKTHROUGH_ALLOWED_REGION=0,0,1920,1080
CLICKTHROUGH_EXEC_ENABLED=true
CLICKTHROUGH_EXEC_DEFAULT_SHELL=powershell
CLICKTHROUGH_EXEC_TIMEOUT_S=30
CLICKTHROUGH_EXEC_MAX_TIMEOUT_S=120
CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS=20000

View File

@@ -7,6 +7,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
- **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes) - **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes)
- **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported) - **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
- **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
- **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
- **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction
@@ -48,6 +49,11 @@ Environment variables:
- `CLICKTHROUGH_GRID_ROWS` (default `12`) - `CLICKTHROUGH_GRID_ROWS` (default `12`)
- `CLICKTHROUGH_GRID_COLS` (default `12`) - `CLICKTHROUGH_GRID_COLS` (default `12`)
- `CLICKTHROUGH_ALLOWED_REGION` (optional `x,y,width,height`) - `CLICKTHROUGH_ALLOWED_REGION` (optional `x,y,width,height`)
- `CLICKTHROUGH_EXEC_ENABLED` (default `true`)
- `CLICKTHROUGH_EXEC_DEFAULT_SHELL` (default `powershell`; one of `powershell`, `bash`, `cmd`)
- `CLICKTHROUGH_EXEC_TIMEOUT_S` (default `30`; used when a request omits `timeout_s`)
- `CLICKTHROUGH_EXEC_MAX_TIMEOUT_S` (default `120`; any larger effective timeout is clamped to this value)
- `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` (default `20000`)
## Gitea CI ## Gitea CI

View File

@@ -17,5 +17,8 @@
- CI workflow runs syntax checks on push + PR - CI workflow runs syntax checks on push + PR
## Next ## Next
- Manual runtime test on a desktop session (capture + click loop) - [x] Add `POST /exec` endpoint (PowerShell/Bash/CMD) with timeout + stdout/stderr
- Optional: add monitor selection and OCR helper endpoint - [x] Add exec configuration via env (`CLICKTHROUGH_EXEC_*`)
- [x] Document exec API + config
- [x] Create backlog issues for OCR/find/window/input/session-state improvements
- [ ] Open PR for exec feature branch and review/merge

View File

@@ -10,7 +10,7 @@ x-clickthrough-token: <token>
## `GET /health` ## `GET /health`
Returns status and runtime safety flags. Returns status and runtime safety flags, including `exec` capability config.
## `GET /screen` ## `GET /screen`
@@ -143,6 +143,28 @@ Hotkey:
} }
``` ```
## `POST /exec`
Execute a shell command on the host running Clickthrough.
```json
{
"command": "Get-Process | Select-Object -First 5",
"shell": "powershell",
"timeout_s": 20,
"cwd": "C:/Users/Paul",
"dry_run": false
}
```
Notes:
- `shell` supports `powershell`, `bash`, `cmd`
- if `shell` is omitted, server uses `CLICKTHROUGH_EXEC_DEFAULT_SHELL`
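- `stdout` and `stderr` are each truncated to at most `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` characters; the response flags this via `stdout_truncated` / `stderr_truncated`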
- endpoint can be disabled with `CLICKTHROUGH_EXEC_ENABLED=false`
Response includes `stdout`, `stderr`, `exit_code`, timeout state, and execution metadata.
## `POST /batch` ## `POST /batch`
Runs multiple `action` payloads sequentially. Runs multiple `action` payloads sequentially.

View File

@@ -1,6 +1,7 @@
import base64 import base64
import io import io
import os import os
import subprocess
import time import time
import uuid import uuid
from typing import Literal, Optional from typing import Literal, Optional
@@ -43,6 +44,11 @@ SETTINGS = {
"default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")), "default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")),
"default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")), "default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")),
"allowed_region": _parse_allowed_region(), "allowed_region": _parse_allowed_region(),
"exec_enabled": _env_bool("CLICKTHROUGH_EXEC_ENABLED", True),
"exec_default_shell": os.getenv("CLICKTHROUGH_EXEC_DEFAULT_SHELL", "powershell").strip().lower(),
"exec_default_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_TIMEOUT_S", "30")),
"exec_max_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_MAX_TIMEOUT_S", "120")),
"exec_max_output_chars": int(os.getenv("CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS", "20000")),
} }
@@ -130,6 +136,14 @@ class BatchRequest(BaseModel):
stop_on_error: bool = True stop_on_error: bool = True
class ExecRequest(BaseModel):
    # Request body accepted by ``POST /exec``.

    # Command text handed to the selected shell (1 to 10000 characters).
    command: str = Field(min_length=1, max_length=10000)
    # Shell to run the command with; ``None`` defers to the configured default.
    shell: Optional[Literal["powershell", "bash", "cmd"]] = None
    # Requested timeout in seconds (1 to 600); ``None`` defers to the configured default.
    timeout_s: Optional[int] = Field(default=None, ge=1, le=600)
    # Working directory for the command; ``None`` leaves it unset.
    cwd: Optional[str] = None
    # When true, the command is resolved and reported but not executed.
    dry_run: bool = False
def _auth(x_clickthrough_token: Optional[str] = Header(default=None)): def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
token = SETTINGS["token"] token = SETTINGS["token"]
if token and x_clickthrough_token != token: if token and x_clickthrough_token != token:
@@ -259,6 +273,111 @@ def _import_input_lib():
raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc
def _pick_shell(explicit_shell: str | None) -> str:
shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
if shell_name not in {"powershell", "bash", "cmd"}:
raise HTTPException(status_code=400, detail="unsupported shell")
return shell_name
def _truncate_text(text: str, limit: int) -> tuple[str, bool]:
if len(text) <= limit:
return text, False
return text[:limit], True
def _resolve_exec_program(shell_name: str, command: str) -> list[str]:
if shell_name == "powershell":
return ["powershell", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", command]
if shell_name == "bash":
return ["bash", "-lc", command]
if shell_name == "cmd":
return ["cmd", "/c", command]
raise HTTPException(status_code=400, detail="unsupported shell")
def _decode_exec_output(data: bytes | str | None) -> str:
    """Normalise captured process output to ``str`` (``None`` becomes ``""``)."""
    if data is None:
        return ""
    if isinstance(data, bytes):
        # ``subprocess.TimeoutExpired`` can carry raw bytes even when the
        # process was started in text mode; ``str(data)`` would yield the
        # ``b'...'`` repr instead of the actual text.
        import locale

        return data.decode(locale.getpreferredencoding(False), errors="replace")
    return data


def _exec_command(req: ExecRequest) -> dict:
    """Run (or dry-run) the shell command described by *req*.

    The effective timeout is the request's ``timeout_s`` (or the configured
    default when omitted), clamped to the configured maximum. ``cwd`` is
    made absolute and must be an existing directory. Captured ``stdout`` and
    ``stderr`` are each truncated to the configured ``exec_max_output_chars``.

    Returns:
        For a dry run, a dict with ``executed=False`` describing what would
        have been run. Otherwise a dict with ``executed=True``, the
        ``timed_out`` flag, ``exit_code`` (``None`` on timeout), the captured
        output with truncation flags, and execution metadata.

    Raises:
        HTTPException: 403 when exec is disabled, 400 for an unsupported
            shell, an invalid ``cwd``, or a missing shell executable.
    """
    if not SETTINGS["exec_enabled"]:
        raise HTTPException(status_code=403, detail="exec endpoint disabled")

    run_dry = SETTINGS["dry_run"] or req.dry_run
    shell_name = _pick_shell(req.shell)
    timeout_s = req.timeout_s if req.timeout_s is not None else SETTINGS["exec_default_timeout_s"]
    timeout_s = min(timeout_s, SETTINGS["exec_max_timeout_s"])

    cwd = None
    if req.cwd:
        cwd = os.path.abspath(req.cwd)
        if not os.path.isdir(cwd):
            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = _resolve_exec_program(shell_name, req.command)

    if run_dry:
        return {
            "executed": False,
            "dry_run": True,
            "shell": shell_name,
            "command": req.command,
            "argv": argv,
            "timeout_s": timeout_s,
            "cwd": cwd,
        }

    # Monotonic clock: the reported duration must not jump if the wall clock
    # is adjusted while the command is running.
    started = time.monotonic()
    try:
        completed = subprocess.run(
            argv,
            cwd=cwd,
            # Do not let the child read from the server's own stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            # Undecodable bytes become U+FFFD instead of raising
            # UnicodeDecodeError and failing the whole request.
            errors="replace",
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        timed_out = True
        exit_code = None
        raw_stdout, raw_stderr = exc.stdout, exc.stderr
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"shell executable not found: {exc}") from exc
    else:
        timed_out = False
        exit_code = completed.returncode
        raw_stdout, raw_stderr = completed.stdout, completed.stderr

    max_chars = SETTINGS["exec_max_output_chars"]
    stdout, stdout_truncated = _truncate_text(_decode_exec_output(raw_stdout), max_chars)
    stderr, stderr_truncated = _truncate_text(_decode_exec_output(raw_stderr), max_chars)

    return {
        "executed": True,
        "timed_out": timed_out,
        "shell": shell_name,
        "command": req.command,
        "argv": argv,
        "timeout_s": timeout_s,
        "cwd": cwd,
        "duration_ms": int((time.monotonic() - started) * 1000),
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
        "stdout_truncated": stdout_truncated,
        "stderr_truncated": stderr_truncated,
    }
def _exec_action(req: ActionRequest) -> dict: def _exec_action(req: ActionRequest) -> dict:
run_dry = SETTINGS["dry_run"] or req.dry_run run_dry = SETTINGS["dry_run"] or req.dry_run
@@ -331,6 +450,12 @@ def health(_: None = Depends(_auth)):
"request_id": _request_id(), "request_id": _request_id(),
"dry_run": SETTINGS["dry_run"], "dry_run": SETTINGS["dry_run"],
"allowed_region": SETTINGS["allowed_region"], "allowed_region": SETTINGS["allowed_region"],
"exec": {
"enabled": SETTINGS["exec_enabled"],
"default_shell": SETTINGS["exec_default_shell"],
"default_timeout_s": SETTINGS["exec_default_timeout_s"],
"max_timeout_s": SETTINGS["exec_max_timeout_s"],
},
} }
@@ -449,6 +574,17 @@ def action(req: ActionRequest, _: None = Depends(_auth)):
} }
@app.post("/exec")
def exec_command(req: ExecRequest, _: None = Depends(_auth)):
    # Run the command first so the request id and timestamp are taken
    # after execution has finished, then wrap the outcome in the envelope.
    outcome = _exec_command(req)
    envelope = {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
    }
    envelope["result"] = outcome
    return envelope
@app.post("/batch") @app.post("/batch")
def batch(req: BatchRequest, _: None = Depends(_auth)): def batch(req: BatchRequest, _: None = Depends(_auth)):
results = [] results = []