feat(exec): add shell command execution endpoint

2026-04-05 20:18:07 +02:00
parent bb247aaad2
commit 930cdd2887
5 changed files with 176 additions and 3 deletions
--- a/.env.example
+++ b/.env.example
@@ -7,3 +7,9 @@ CLICKTHROUGH_DRY_RUN=false
 CLICKTHROUGH_GRID_ROWS=12
 CLICKTHROUGH_GRID_COLS=12
 # CLICKTHROUGH_ALLOWED_REGION=0,0,1920,1080
+
+CLICKTHROUGH_EXEC_ENABLED=true
+CLICKTHROUGH_EXEC_DEFAULT_SHELL=powershell
+CLICKTHROUGH_EXEC_TIMEOUT_S=30
+CLICKTHROUGH_EXEC_MAX_TIMEOUT_S=120
+CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS=20000
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
 - **Visual endpoints**: full-screen capture with optional grid overlay and labeled cells (`asImage=true` can return raw image bytes)
 - **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
 - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
+- **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
 - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
 - **Safety knobs**: token auth, dry-run mode, optional allowed-region restriction

@@ -48,6 +49,11 @@ Environment variables:
 - `CLICKTHROUGH_GRID_ROWS` (default `12`)
 - `CLICKTHROUGH_GRID_COLS` (default `12`)
 - `CLICKTHROUGH_ALLOWED_REGION` (optional `x,y,width,height`)
+- `CLICKTHROUGH_EXEC_ENABLED` (default `true`)
+- `CLICKTHROUGH_EXEC_DEFAULT_SHELL` (default `powershell`; one of `powershell`, `bash`, `cmd`)
+- `CLICKTHROUGH_EXEC_TIMEOUT_S` (default `30`)
+- `CLICKTHROUGH_EXEC_MAX_TIMEOUT_S` (default `120`)
+- `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` (default `20000`)

 ## Gitea CI

--- a/TODO.md
+++ b/TODO.md
@@ -17,5 +17,8 @@
 - CI workflow runs syntax checks on push + PR

 ## Next
- Manual runtime test on a desktop session (capture + click loop)
- Optional: add monitor selection and OCR helper endpoint
+- [x] Add `POST /exec` endpoint (PowerShell/Bash/CMD) with timeout + stdout/stderr
+- [x] Add exec configuration via env (`CLICKTHROUGH_EXEC_*`)
+- [x] Document exec API + config
+- [x] Create backlog issues for OCR/find/window/input/session-state improvements
+- [ ] Open PR for exec feature branch and review/merge
--- a/docs/API.md
+++ b/docs/API.md
@@ -10,7 +10,7 @@ x-clickthrough-token: <token>

 ## `GET /health`

-Returns status and runtime safety flags.
+Returns status and runtime safety flags, including `exec` capability config.

 ## `GET /screen`

@@ -143,6 +143,28 @@ Hotkey:
 }
 ```

+## `POST /exec`
+
+Execute a shell command on the host running Clickthrough.
+
+```json
+{
+  "command": "Get-Process | Select-Object -First 5",
+  "shell": "powershell",
+  "timeout_s": 20,
+  "cwd": "C:/Users/Paul",
+  "dry_run": false
+}
+```
+
+Notes:
+- `shell` supports `powershell`, `bash`, `cmd`
+- if `shell` is omitted, server uses `CLICKTHROUGH_EXEC_DEFAULT_SHELL`
+- `stdout` and `stderr` are each truncated to at most `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` characters
+- endpoint can be disabled with `CLICKTHROUGH_EXEC_ENABLED=false`
+
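+Response `result` includes `stdout`, `stderr`, `exit_code` (`null` on timeout), `timed_out`, `stdout_truncated`/`stderr_truncated`, and execution metadata (`shell`, `command`, `argv`, `cwd`, `timeout_s`, `duration_ms`).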
+
 ## `POST /batch`

 Runs multiple `action` payloads sequentially.
--- a/server/app.py
+++ b/server/app.py
@@ -1,6 +1,7 @@
 import base64
 import io
 import os
+import subprocess
 import time
 import uuid
 from typing import Literal, Optional
@@ -43,6 +44,11 @@ SETTINGS = {
    "default_grid_rows": int(os.getenv("CLICKTHROUGH_GRID_ROWS", "12")),
    "default_grid_cols": int(os.getenv("CLICKTHROUGH_GRID_COLS", "12")),
    "allowed_region": _parse_allowed_region(),
+    "exec_enabled": _env_bool("CLICKTHROUGH_EXEC_ENABLED", True),
+    "exec_default_shell": os.getenv("CLICKTHROUGH_EXEC_DEFAULT_SHELL", "powershell").strip().lower(),
+    "exec_default_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_TIMEOUT_S", "30")),
+    "exec_max_timeout_s": int(os.getenv("CLICKTHROUGH_EXEC_MAX_TIMEOUT_S", "120")),
+    "exec_max_output_chars": int(os.getenv("CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS", "20000")),
 }


@@ -130,6 +136,14 @@ class BatchRequest(BaseModel):
    stop_on_error: bool = True


+class ExecRequest(BaseModel):  # JSON body accepted by POST /exec
+    command: str = Field(min_length=1, max_length=10000)  # command text handed to the chosen shell
+    shell: Literal["powershell", "bash", "cmd"] | None = None  # None -> SETTINGS["exec_default_shell"]
+    timeout_s: int | None = Field(default=None, ge=1, le=600)  # None -> configured default; clamped to exec_max_timeout_s
+    cwd: str | None = None  # working directory; rejected with HTTP 400 unless it is an existing directory
+    dry_run: bool = False  # True -> return the planned argv without running anything
+
+
 def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
    token = SETTINGS["token"]
    if token and x_clickthrough_token != token:
@@ -259,6 +273,111 @@ def _import_input_lib():
        raise HTTPException(status_code=500, detail=f"input backend unavailable: {exc}") from exc


+def _pick_shell(explicit_shell: str | None) -> str:
+    chosen = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()  # request > config > fallback
+    if chosen in ("powershell", "bash", "cmd"):
+        return chosen
+    raise HTTPException(status_code=400, detail="unsupported shell")  # anything outside the three known shells
+
+
+def _truncate_text(text: str, limit: int) -> tuple[str, bool]:
+    """Return ``text`` cut to at most ``limit`` characters, plus whether it was cut."""
+    was_cut = len(text) > limit
+    return (text[:limit] if was_cut else text), was_cut
+
+
+def _resolve_exec_program(shell_name: str, command: str) -> list[str]:
+    """Build the argv that runs ``command`` under ``shell_name``; HTTP 400 for unknown shells."""
+    ps_flags = ["-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command"]
+    launchers = {"powershell": ["powershell", *ps_flags], "bash": ["bash", "-lc"], "cmd": ["cmd", "/c"]}
+    prefix = launchers.get(shell_name)
+    if prefix is None:
+        raise HTTPException(status_code=400, detail="unsupported shell")
+    return [*prefix, command]
+
+
+def _exec_command(req: ExecRequest) -> dict:
+    """Run ``req.command`` in the selected shell and return a result dict.
+
+    Honors dry-run (global or per-request), clamps the timeout to
+    ``exec_max_timeout_s`` and truncates stdout/stderr independently to
+    ``exec_max_output_chars``. On timeout ``exit_code`` is ``None``.
+
+    Raises:
+        HTTPException: 403 if exec is disabled; 400 for an unsupported shell,
+            a ``cwd`` that is not a directory, or a missing shell executable.
+    """
+    import locale  # function-scope: only needed to decode timeout output
+
+    if not SETTINGS["exec_enabled"]:
+        raise HTTPException(status_code=403, detail="exec endpoint disabled")
+
+    run_dry = SETTINGS["dry_run"] or req.dry_run
+    shell_name = _pick_shell(req.shell)
+
+    timeout_s = req.timeout_s if req.timeout_s is not None else SETTINGS["exec_default_timeout_s"]
+    timeout_s = min(timeout_s, SETTINGS["exec_max_timeout_s"])
+
+    cwd = None
+    if req.cwd:
+        cwd = os.path.abspath(req.cwd)
+        if not os.path.isdir(cwd):
+            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")
+
+    argv = _resolve_exec_program(shell_name, req.command)
+
+    # Fields shared by the dry-run, timeout, and completed responses.
+    meta = {
+        "shell": shell_name,
+        "command": req.command,
+        "argv": argv,
+        "timeout_s": timeout_s,
+        "cwd": cwd,
+    }
+    if run_dry:
+        return {"executed": False, "dry_run": True, **meta}
+
+    def _as_text(raw: str | bytes | None) -> str:
+        # TimeoutExpired may carry captured output as bytes even with text=True;
+        # str(bytes) would yield the "b'...'" repr, so decode explicitly.
+        if isinstance(raw, bytes):
+            return raw.decode(locale.getpreferredencoding(False), errors="replace")
+        return raw or ""
+
+    limit = SETTINGS["exec_max_output_chars"]
+    start = time.monotonic()  # monotonic: unaffected by wall-clock adjustments
+    timed_out = False
+    exit_code = None
+    try:
+        completed = subprocess.run(
+            argv, cwd=cwd, capture_output=True, text=True, timeout=timeout_s, check=False
+        )
+    except subprocess.TimeoutExpired as exc:
+        timed_out = True
+        raw_out, raw_err = exc.stdout, exc.stderr
+    except FileNotFoundError as exc:
+        raise HTTPException(status_code=400, detail=f"shell executable not found: {exc}") from exc
+    else:
+        exit_code = completed.returncode
+        raw_out, raw_err = completed.stdout, completed.stderr
+    duration_ms = int((time.monotonic() - start) * 1000)
+
+    stdout, stdout_truncated = _truncate_text(_as_text(raw_out), limit)
+    stderr, stderr_truncated = _truncate_text(_as_text(raw_err), limit)
+
+    return {
+        "executed": True,
+        "timed_out": timed_out,
+        **meta,
+        "duration_ms": duration_ms,
+        "exit_code": exit_code,
+        "stdout": stdout,
+        "stderr": stderr,
+        "stdout_truncated": stdout_truncated,
+        "stderr_truncated": stderr_truncated,
+    }
+
+
 def _exec_action(req: ActionRequest) -> dict:
    run_dry = SETTINGS["dry_run"] or req.dry_run

@@ -331,6 +450,12 @@ def health(_: None = Depends(_auth)):
        "request_id": _request_id(),
        "dry_run": SETTINGS["dry_run"],
        "allowed_region": SETTINGS["allowed_region"],
+        "exec": {
+            "enabled": SETTINGS["exec_enabled"],
+            "default_shell": SETTINGS["exec_default_shell"],
+            "default_timeout_s": SETTINGS["exec_default_timeout_s"],
+            "max_timeout_s": SETTINGS["exec_max_timeout_s"],
+        },
    }


@@ -449,6 +574,17 @@ def action(req: ActionRequest, _: None = Depends(_auth)):
    }


+@app.post("/exec")
+def exec_command(req: ExecRequest, _: None = Depends(_auth)):
+    """Run the requested shell command and return it wrapped with request metadata."""
+    # Execute before building the envelope so an HTTPException raised by
+    # _exec_command propagates without any partial response being assembled.
+    outcome = _exec_command(req)
+    envelope = {"ok": True, "request_id": _request_id(), "time_ms": _now_ms()}
+    envelope["result"] = outcome
+    return envelope
+
+
@app.post("/batch")
 def batch(req: BatchRequest, _: None = Depends(_auth)):
    results = []