diff --git a/.env.example b/.env.example index b18441f..2ebd111 100644 --- a/.env.example +++ b/.env.example @@ -7,3 +7,9 @@ CLICKTHROUGH_DRY_RUN=false CLICKTHROUGH_GRID_ROWS=12 CLICKTHROUGH_GRID_COLS=12 # CLICKTHROUGH_ALLOWED_REGION=0,0,1920,1080 + +CLICKTHROUGH_EXEC_ENABLED=true +CLICKTHROUGH_EXEC_DEFAULT_SHELL=powershell +CLICKTHROUGH_EXEC_TIMEOUT_S=30 +CLICKTHROUGH_EXEC_MAX_TIMEOUT_S=120 +CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS=20000 diff --git a/README.md b/README.md index 6184330..cfbd57e 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots - **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes) - **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported) - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey +- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec` - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction @@ -48,6 +49,11 @@ Environment variables: - `CLICKTHROUGH_GRID_ROWS` (default `12`) - `CLICKTHROUGH_GRID_COLS` (default `12`) - `CLICKTHROUGH_ALLOWED_REGION` (optional `x,y,width,height`) +- `CLICKTHROUGH_EXEC_ENABLED` (default `true`) +- `CLICKTHROUGH_EXEC_DEFAULT_SHELL` (default `powershell`; one of `powershell`, `bash`, `cmd`) +- `CLICKTHROUGH_EXEC_TIMEOUT_S` (default `30`) +- `CLICKTHROUGH_EXEC_MAX_TIMEOUT_S` (default `120`) +- `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` (default `20000`) ## Gitea CI diff --git a/TODO.md b/TODO.md index 37240d6..4a326c7 100644 --- a/TODO.md +++ b/TODO.md @@ -17,5 +17,8 @@ - CI workflow runs syntax checks on push + PR ## Next -- Manual runtime test on a desktop session (capture + click loop) -- Optional: add monitor selection and OCR 
helper endpoint +- [x] Add `POST /exec` endpoint (PowerShell/Bash/CMD) with timeout + stdout/stderr +- [x] Add exec configuration via env (`CLICKTHROUGH_EXEC_*`) +- [x] Document exec API + config +- [x] Create backlog issues for OCR/find/window/input/session-state improvements +- [ ] Open PR for exec feature branch and review/merge diff --git a/docs/API.md b/docs/API.md index 4a266b6..1c9bf7d 100644 --- a/docs/API.md +++ b/docs/API.md @@ -10,7 +10,7 @@ x-clickthrough-token: ## `GET /health` -Returns status and runtime safety flags. +Returns status and runtime safety flags, including `exec` capability config. ## `GET /screen` @@ -143,6 +143,28 @@ Hotkey: } ``` +## `POST /exec` + +Execute a shell command on the host running Clickthrough. + +```json +{ + "command": "Get-Process | Select-Object -First 5", + "shell": "powershell", + "timeout_s": 20, + "cwd": "C:/Users/Paul", + "dry_run": false +} +``` + +Notes: +- `shell` supports `powershell`, `bash`, `cmd` +- if `shell` is omitted, server uses `CLICKTHROUGH_EXEC_DEFAULT_SHELL` +- output is truncated based on `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` +- endpoint can be disabled with `CLICKTHROUGH_EXEC_ENABLED=false` + +Response includes `stdout`, `stderr`, `exit_code`, timeout state, and execution metadata. + ## `POST /batch` Runs multiple `action` payloads sequentially. 
# server/app.py: definitions implementing the `POST /exec` endpoint.
#
# Besides the imports below, this code relies on names defined elsewhere in
# the module: `app`, `SETTINGS`, `_auth`, `_request_id`, `_now_ms`, and the
# FastAPI / pydantic names `BaseModel`, `Field`, `Depends`, `HTTPException`.
#
# SETTINGS keys read here: "dry_run", "exec_enabled", "exec_default_shell",
# "exec_default_timeout_s", "exec_max_timeout_s", "exec_max_output_chars".
import base64
import io
import locale
import os
import subprocess
import time
import uuid
from typing import Literal, Optional


class ExecRequest(BaseModel):
    """Request body accepted by `POST /exec`."""

    # Command line handed verbatim to the selected shell.
    command: str = Field(min_length=1, max_length=10000)
    # Shell used to run the command; None falls back to the configured default.
    shell: Literal["powershell", "bash", "cmd"] | None = None
    # Requested timeout in seconds; None uses the configured default. The
    # effective value is capped by SETTINGS["exec_max_timeout_s"].
    timeout_s: int | None = Field(default=None, ge=1, le=600)
    # Working directory for the child process; None/empty leaves it unset.
    cwd: str | None = None
    # When true the command is resolved and echoed back, but never executed.
    dry_run: bool = False


def _pick_shell(explicit_shell: str | None) -> str:
    """Return the normalized shell name to use for a request.

    Falls back to SETTINGS["exec_default_shell"], then to "powershell", when
    no shell is given explicitly.

    Raises:
        HTTPException: 400 if the resulting name is not a supported shell
            (this also catches a misconfigured default shell).
    """
    shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
    if shell_name not in {"powershell", "bash", "cmd"}:
        raise HTTPException(status_code=400, detail="unsupported shell")
    return shell_name


def _truncate_text(text: str, limit: int) -> tuple[str, bool]:
    """Cut *text* to at most *limit* characters.

    Returns:
        A ``(text, truncated)`` pair; ``truncated`` is True when characters
        were dropped.
    """
    # A negative limit would otherwise slice from the end of the string
    # (text[:-n]) and return almost all of the output instead of none of it.
    limit = max(limit, 0)
    if len(text) <= limit:
        return text, False
    return text[:limit], True


def _coerce_output(raw: str | bytes | None) -> str:
    """Normalize captured process output to ``str``.

    ``subprocess.TimeoutExpired`` may carry ``bytes`` (or ``None``) even when
    the process was started in text mode, so all three shapes are handled.
    """
    if raw is None:
        return ""
    if isinstance(raw, bytes):
        # Same default encoding that text mode uses for the pipes.
        return raw.decode(locale.getpreferredencoding(False), errors="replace")
    return raw


def _resolve_exec_program(shell_name: str, command: str) -> list[str]:
    """Build the argv list that runs *command* through *shell_name*.

    Raises:
        HTTPException: 400 if *shell_name* is not a supported shell.
    """
    if shell_name == "powershell":
        return ["powershell", "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", command]
    if shell_name == "bash":
        return ["bash", "-lc", command]
    if shell_name == "cmd":
        return ["cmd", "/c", command]
    raise HTTPException(status_code=400, detail="unsupported shell")


def _exec_command(req: ExecRequest) -> dict:
    """Run (or dry-run) the shell command described by *req*.

    Returns:
        A dict with the resolved ``shell``, ``command``, ``argv``,
        ``timeout_s`` and ``cwd``. For a dry run it carries
        ``executed=False`` / ``dry_run=True``; otherwise it carries
        ``executed=True``, ``timed_out``, ``duration_ms``, ``exit_code``
        (None on timeout), and ``stdout`` / ``stderr`` with their
        ``*_truncated`` flags.

    Raises:
        HTTPException: 403 if exec is disabled; 400 for an unsupported shell,
            an invalid ``cwd``, or a missing shell executable; 500 if the
            shell process cannot be started for another OS-level reason.
    """
    if not SETTINGS["exec_enabled"]:
        raise HTTPException(status_code=403, detail="exec endpoint disabled")

    run_dry = SETTINGS["dry_run"] or req.dry_run
    shell_name = _pick_shell(req.shell)

    # The request may ask for up to 600 s; the server-side maximum wins.
    timeout_s = req.timeout_s if req.timeout_s is not None else SETTINGS["exec_default_timeout_s"]
    timeout_s = min(timeout_s, SETTINGS["exec_max_timeout_s"])

    cwd = None
    if req.cwd:
        cwd = os.path.abspath(req.cwd)
        if not os.path.isdir(cwd):
            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")

    argv = _resolve_exec_program(shell_name, req.command)

    # Fields shared by every response shape, in their established order.
    request_info = {
        "shell": shell_name,
        "command": req.command,
        "argv": argv,
        "timeout_s": timeout_s,
        "cwd": cwd,
    }

    if run_dry:
        return {"executed": False, "dry_run": True, **request_info}

    timed_out = False
    exit_code = None
    # monotonic() is immune to wall-clock adjustments while the child runs.
    started = time.monotonic()
    try:
        completed = subprocess.run(
            argv,
            cwd=cwd,
            # Never let the child wait on the server's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            # Undecodable bytes in the output must not fail the request.
            errors="replace",
            timeout=timeout_s,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        timed_out = True
        raw_stdout, raw_stderr = exc.stdout, exc.stderr
    except FileNotFoundError as exc:
        raise HTTPException(status_code=400, detail=f"shell executable not found: {exc}") from exc
    except OSError as exc:
        # e.g. PermissionError on the executable or the working directory.
        raise HTTPException(status_code=500, detail=f"failed to start shell: {exc}") from exc
    else:
        exit_code = completed.returncode
        raw_stdout, raw_stderr = completed.stdout, completed.stderr
    duration_ms = int((time.monotonic() - started) * 1000)

    max_chars = SETTINGS["exec_max_output_chars"]
    stdout, stdout_truncated = _truncate_text(_coerce_output(raw_stdout), max_chars)
    stderr, stderr_truncated = _truncate_text(_coerce_output(raw_stderr), max_chars)

    return {
        "executed": True,
        "timed_out": timed_out,
        **request_info,
        "duration_ms": duration_ms,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
        "stdout_truncated": stdout_truncated,
        "stderr_truncated": stderr_truncated,
    }


# NOTE(review): the visible part of `_auth` only compares the header when a
# token is configured. If that holds, running with exec enabled and no token
# exposes unauthenticated command execution -- confirm, and consider
# requiring a token for this route.
@app.post("/exec")
def exec_command(req: ExecRequest, _: None = Depends(_auth)):
    """Execute a shell command on the host and return its result envelope."""
    result = _exec_command(req)
    return {
        "ok": True,
        "request_id": _request_id(),
        "time_ms": _now_ms(),
        "result": result,
    }