From 493e5499e8e60462140879c53619687277b9c149 Mon Sep 17 00:00:00 2001
From: Luna
Date: Fri, 1 May 2026 15:52:02 +0200
Subject: [PATCH] feat(window): add window lifecycle and launch endpoints

---
 README.md      |   6 +-
 docs/API.md    |  83 ++++++++++++++
 server/app.py  | 290 +++++++++++++++++++++++++++++++++++++++++++++++++
 skill/SKILL.md |   5 +-
 4 files changed, 382 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index bd29d03..0dadc4f 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@ Let an Agent interact with your computer over HTTP, with grid-aware screenshots
 - **Zoom endpoint**: crop around a point with denser grid for fine targeting (`asImage=true` supported)
 - **Multi-display support**: list displays with `GET /displays` and select one with `?screen=0`, `?screen=1`, ...
 - **Action endpoints**: move/click/right-click/double-click/middle-click/scroll/type/hotkey
+- **Window lifecycle endpoints**: list/focus/restore/minimize/maximize/close windows via `GET /windows` + `POST /windows/action`
+- **Structured launch endpoint**: start an app/process without dropping to a shell via `POST /launch`
 - **OCR endpoint**: extract text blocks with bounding boxes via `POST /ocr`
 - **Command execution endpoint**: run PowerShell/Bash/CMD commands via `POST /exec`
 - **Coordinate transform metadata** in visual responses so agents can map grid cells to real pixels
@@ -41,7 +43,7 @@ For OCR support, install the native `tesseract` binary on the host (in addition
 Important:
 - `POST /action` expects an `action` plus a `target` object; do not send raw top-level `x` / `y` fields.
 - Pixel coordinates and OCR bounding boxes are always global desktop coordinates.
-- Prefer structured GUI interaction first; use `/exec` for launch, recovery, or explicit system-level tasks.
+- Prefer structured GUI interaction first; use `/windows`, `/launch`, and `/action` before reaching for `/exec`.
 
 See:
 - `docs/API.md`
@@ -67,6 +69,8 @@ Environment variables:
 - `CLICKTHROUGH_EXEC_MAX_OUTPUT_CHARS` (default `20000`)
 - `CLICKTHROUGH_TESSERACT_CMD` (optional path to the `tesseract` executable)
 
+Window management endpoints currently target Windows hosts. On non-Windows hosts they return `501` instead of guessing.
+
 ## Gitea CI
 
 A Gitea Actions workflow is included at `.gitea/workflows/python-syntax.yml`.
diff --git a/docs/API.md b/docs/API.md
index dfbb104..c8a72b1 100644
--- a/docs/API.md
+++ b/docs/API.md
@@ -194,6 +194,89 @@ Move only:
 }
 ```
 
+## `GET /windows`
+
+List desktop windows using structured filters instead of shelling out.
+
+Query params:
+
+- `title_contains` (optional substring match)
+- `title_regex` (optional case-insensitive regex)
+- `process_name` (optional exact process name, e.g. `explorer.exe`)
+- `hwnd` (optional exact window handle)
+- `visible_only` (bool, default `true`)
+
+```json
+{
+  "ok": true,
+  "count": 1,
+  "windows": [
+    {
+      "hwnd": 132640,
+      "title": "WinDirStat",
+      "class_name": "WinDirStatMainWindow",
+      "pid": 18420,
+      "process_name": "windirstat.exe",
+      "visible": true,
+      "enabled": true,
+      "minimized": false,
+      "maximized": false,
+      "foreground": true,
+      "rect": {"x": 194, "y": 116, "width": 1532, "height": 870}
+    }
+  ]
+}
+```
+
+Notes:
+- Currently supported on Windows hosts only.
+- Listing never returns `409`; that ambiguity error is raised only by mutation endpoints such as `POST /windows/action` when a query matches multiple windows.
+
+## `POST /windows/action`
+
+Perform a structured window action against exactly one matched window.
+
+```json
+{
+  "action": "focus",
+  "title_contains": "WinDirStat",
+  "visible_only": true,
+  "timeout_ms": 3000
+}
+```
+
+Supported actions:
+- `focus`
+- `restore`
+- `minimize`
+- `maximize`
+- `close`
+
+The response includes the matched pre-action window and the final observed window state (or `closed=true` if it disappeared). The endpoint returns `404` when no window matches and `409` when several match and no `hwnd` was given.
+
+## `POST /launch`
+
+Start an app/process without invoking a shell.
+
+```json
+{
+  "executable": "C:/Program Files/WinDirStat/WinDirStat.exe",
+  "args": [],
+  "cwd": "C:/Program Files/WinDirStat",
+  "wait_for_window": true,
+  "match": {
+    "title_contains": "WinDirStat",
+    "visible_only": true
+  },
+  "timeout_ms": 8000
+}
+```
+
+Notes:
+- Launch uses direct process execution (`subprocess.Popen`) rather than PowerShell/CMD.
+- If `wait_for_window=true`, the server polls for a matching window and returns `window_found`.
+- `dry_run=true` returns the resolved argv/cwd without launching.
+
 ## `POST /ocr`
 
 Extract visible text from either a full screenshot, a region crop, or caller-provided image bytes.
diff --git a/server/app.py b/server/app.py
index fe19ce1..39610e8 100644
--- a/server/app.py
+++ b/server/app.py
@@ -1,8 +1,11 @@
 import base64
+import ctypes
 import hmac
 import io
 import os
+import re
 import subprocess
+import sys
 import time
 import uuid
 from typing import Literal, Optional
@@ -168,6 +171,35 @@ class OCRRequest(BaseModel):
         return self
 
 
+class WindowQuery(BaseModel):
+    """Filters used to select desktop windows; every filter that is set must match."""
+
+    title_contains: str | None = Field(default=None, max_length=512)
+    title_regex: str | None = Field(default=None, max_length=512)
+    process_name: str | None = Field(default=None, max_length=260)
+    hwnd: int | None = Field(default=None, ge=1)
+    visible_only: bool = True
+
+
+class WindowActionRequest(WindowQuery):
+    """Window filters plus the lifecycle action to apply to the single match."""
+
+    action: Literal["focus", "restore", "minimize", "maximize", "close"]
+    timeout_ms: int = Field(default=3000, ge=0, le=60000)
+
+
+class LaunchRequest(BaseModel):
+    """Request body for starting a process directly (no shell involved)."""
+
+    executable: str = Field(min_length=1, max_length=2048)
+    args: list[str] = Field(default_factory=list, max_length=100)
+    cwd: str | None = None
+    wait_for_window: bool = False
+    match: WindowQuery | None = None
+    timeout_ms: int = Field(default=5000, ge=0, le=120000)
+    dry_run: bool = False
+
+
 def _auth(x_clickthrough_token: Optional[str] = Header(default=None)):
     token = SETTINGS["token"]
     if token and x_clickthrough_token != token:
@@ -456,6 +488,282 @@ def _run_ocr(image, language_hint: str | None, min_confidence: float, offset_x:
     return blocks
 
 
+def _windows_only(feature: str):
+    """Raise HTTP 501 unless the server runs on a Windows host."""
+    if sys.platform != "win32":
+        raise HTTPException(status_code=501, detail=f"{feature} is currently supported on Windows hosts only")
+
+
+def _tasklist_process_name(pid: int) -> str | None:
+    """Return the image name `tasklist` reports for *pid*, or None.
+
+    Best-effort helper: any failure to run or parse `tasklist` yields None.
+    """
+    try:
+        completed = subprocess.run(
+            ["tasklist", "/FI", f"PID eq {pid}", "/FO", "CSV", "/NH"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        )
+    except Exception:  # deliberate best-effort: a lookup failure must not fail the listing
+        return None
+
+    lines = (completed.stdout or "").strip().splitlines()
+    if not lines:
+        return None
+    row = lines[0].strip()
+    if not row or row.startswith("INFO:"):
+        return None
+    # The row is CSV (`/FO CSV /NH`); take the first quoted field.
+    if row.startswith('"') and '","' in row:
+        return row.split('","', 1)[0].strip('"')
+    return None
+
+
+def _compile_title_regex(query: WindowQuery | None):
+    """Compile `query.title_regex` case-insensitively; raise HTTP 400 if it is invalid."""
+    if not query or not query.title_regex:
+        return None
+    try:
+        return re.compile(query.title_regex, re.IGNORECASE)
+    except re.error as exc:
+        raise HTTPException(status_code=400, detail=f"invalid title_regex: {exc}") from exc
+
+
+def _list_windows(query: WindowQuery | None = None) -> list[dict]:
+    """Enumerate top-level windows via user32 and return those matching *query*.
+
+    The result is sorted foreground-first, then by title and handle.
+    Raises HTTP 501 on non-Windows hosts and HTTP 400 for a bad `title_regex`.
+    """
+    _windows_only("window endpoints")
+    # `import ctypes` alone does not load the `wintypes` submodule.
+    from ctypes import wintypes
+
+    title_regex = _compile_title_regex(query)
+    enum_proc = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)
+
+    user32 = ctypes.windll.user32
+    user32.EnumWindows.restype = ctypes.c_bool
+    user32.EnumWindows.argtypes = [enum_proc, ctypes.c_void_p]
+    user32.IsWindowVisible.argtypes = [ctypes.c_void_p]
+    user32.IsWindowVisible.restype = ctypes.c_bool
+    user32.IsWindowEnabled.argtypes = [ctypes.c_void_p]
+    user32.IsWindowEnabled.restype = ctypes.c_bool
+    user32.IsIconic.argtypes = [ctypes.c_void_p]
+    user32.IsIconic.restype = ctypes.c_bool
+    user32.IsZoomed.argtypes = [ctypes.c_void_p]
+    user32.IsZoomed.restype = ctypes.c_bool
+    user32.GetWindowTextLengthW.argtypes = [ctypes.c_void_p]
+    user32.GetWindowTextLengthW.restype = ctypes.c_int
+    user32.GetWindowTextW.argtypes = [ctypes.c_void_p, ctypes.c_wchar_p, ctypes.c_int]
+    user32.GetClassNameW.argtypes = [ctypes.c_void_p, ctypes.c_wchar_p, ctypes.c_int]
+    user32.GetClassNameW.restype = ctypes.c_int
+    user32.GetForegroundWindow.restype = ctypes.c_void_p
+    user32.GetWindowRect.argtypes = [ctypes.c_void_p, ctypes.POINTER(wintypes.RECT)]
+    user32.GetWindowThreadProcessId.argtypes = [ctypes.c_void_p, ctypes.POINTER(wintypes.DWORD)]
+    user32.GetWindowThreadProcessId.restype = wintypes.DWORD
+
+    foreground = int(user32.GetForegroundWindow() or 0)
+    windows: list[dict] = []
+    # Cache names per pid: each lookup spawns a `tasklist` process.
+    process_names: dict[int, str | None] = {}
+
+    def _callback(hwnd, _lparam):
+        hwnd_int = int(hwnd)
+        if query and query.hwnd is not None and hwnd_int != query.hwnd:
+            return True
+
+        visible = bool(user32.IsWindowVisible(hwnd))
+        if query and query.visible_only and not visible:
+            return True
+
+        title_len = user32.GetWindowTextLengthW(hwnd)
+        title_buf = ctypes.create_unicode_buffer(max(title_len + 1, 1))
+        user32.GetWindowTextW(hwnd, title_buf, len(title_buf))
+        title = title_buf.value
+        # Cheap title filters run before the costly process-name lookup.
+        if query and query.title_contains and query.title_contains.lower() not in title.lower():
+            return True
+        if title_regex and not title_regex.search(title):
+            return True
+
+        pid = wintypes.DWORD()
+        user32.GetWindowThreadProcessId(hwnd, ctypes.byref(pid))
+        pid_int = int(pid.value)
+        if pid_int not in process_names:
+            process_names[pid_int] = _tasklist_process_name(pid_int)
+        process_name = process_names[pid_int]
+        if query and query.process_name and (process_name or "").lower() != query.process_name.lower():
+            return True
+
+        class_buf = ctypes.create_unicode_buffer(256)
+        user32.GetClassNameW(hwnd, class_buf, len(class_buf))
+        rect = wintypes.RECT()
+        user32.GetWindowRect(hwnd, ctypes.byref(rect))
+
+        windows.append(
+            {
+                "hwnd": hwnd_int,
+                "title": title,
+                "class_name": class_buf.value,
+                "pid": pid_int,
+                "process_name": process_name,
+                "visible": visible,
+                "enabled": bool(user32.IsWindowEnabled(hwnd)),
+                "minimized": bool(user32.IsIconic(hwnd)),
+                "maximized": bool(user32.IsZoomed(hwnd)),
+                "foreground": hwnd_int == foreground,
+                "rect": {
+                    "x": int(rect.left),
+                    "y": int(rect.top),
+                    "width": int(rect.right - rect.left),
+                    "height": int(rect.bottom - rect.top),
+                },
+            }
+        )
+        return True
+
+    user32.EnumWindows(enum_proc(_callback), 0)
+    windows.sort(key=lambda item: (not item["foreground"], item["title"].lower(), item["hwnd"]))
+    return windows
+
+
+def _require_window_match(query: WindowQuery) -> dict:
+    """Return the single window matching *query*.
+
+    Raises HTTP 404 when nothing matches and HTTP 409 when several windows
+    match and no explicit `hwnd` was given.
+    """
+    matches = _list_windows(query)
+    if not matches:
+        raise HTTPException(status_code=404, detail="no matching window found")
+    if len(matches) > 1 and query.hwnd is None:
+        raise HTTPException(
+            status_code=409,
+            detail={"message": "multiple windows matched", "matches": matches[:10]},
+        )
+    return matches[0]
+
+
+def _apply_window_action(req: WindowActionRequest) -> dict:
+    """Apply `req.action` to the single matching window and report the outcome.
+
+    The window is polled at least once, then until the expected state is
+    seen or `timeout_ms` elapses. Returns the pre-action match, the last
+    observed state, and `closed=True` when the window no longer exists.
+    """
+    _windows_only("window endpoints")
+    match = _require_window_match(req)
+    hwnd = match["hwnd"]
+    user32 = ctypes.windll.user32
+    WM_CLOSE = 0x0010
+    SW_RESTORE = 9
+    SW_MINIMIZE = 6
+    SW_MAXIMIZE = 3
+
+    # ShowWindow returns the *previous* visibility, not success, so for
+    # minimize/maximize `ok` is derived from the state observed below.
+    ok = None
+    if req.action == "focus":
+        # SW_RESTORE also un-maximizes, so only use it on minimized windows.
+        if match["minimized"]:
+            user32.ShowWindow(hwnd, SW_RESTORE)
+        ok = bool(user32.SetForegroundWindow(hwnd))
+    elif req.action == "restore":
+        user32.ShowWindow(hwnd, SW_RESTORE)
+        ok = bool(user32.SetForegroundWindow(hwnd))
+    elif req.action == "minimize":
+        user32.ShowWindow(hwnd, SW_MINIMIZE)
+    elif req.action == "maximize":
+        user32.ShowWindow(hwnd, SW_MAXIMIZE)
+    elif req.action == "close":
+        ok = bool(user32.PostMessageW(hwnd, WM_CLOSE, 0, 0))
+    else:
+        raise HTTPException(status_code=400, detail="unsupported window action")
+
+    deadline = time.monotonic() + (req.timeout_ms / 1000.0)
+    while True:  # poll at least once so `closed` always reflects an observation
+        current = _list_windows(WindowQuery(hwnd=hwnd, visible_only=False))
+        final_match = current[0] if current else None
+        if req.action == "close":
+            reached = final_match is None
+        elif final_match is None:
+            reached = False
+        elif req.action == "minimize":
+            reached = final_match["minimized"]
+        elif req.action == "maximize":
+            reached = final_match["maximized"]
+        else:
+            reached = final_match["foreground"] and not final_match["minimized"]
+        if reached or time.monotonic() >= deadline:
+            break
+        time.sleep(0.1)
+
+    return {
+        "ok": reached if ok is None else ok,
+        "matched": match,
+        "window": final_match,
+        "closed": final_match is None,
+    }
+
+
+def _launch_app(req: LaunchRequest) -> dict:
+    """Start `req.executable` directly (no shell), optionally waiting for a window.
+
+    Returns argv, cwd and pid; with `wait_for_window` also the first matching
+    window (or None) and `window_found`. A dry run returns argv/cwd only.
+    """
+    if req.cwd:
+        cwd = os.path.abspath(req.cwd)
+        if not os.path.isdir(cwd):
+            raise HTTPException(status_code=400, detail="cwd does not exist or is not a directory")
+    else:
+        cwd = None
+
+    argv = [req.executable, *req.args]
+    if SETTINGS["dry_run"] or req.dry_run:
+        return {"executed": False, "dry_run": True, "argv": argv, "cwd": cwd}
+
+    query = None
+    if req.wait_for_window:
+        # Reject unsupported hosts and bad filters before launching, so these
+        # errors cannot occur after the process has already been started.
+        _windows_only("window endpoints")
+        query = req.match or WindowQuery(process_name=os.path.basename(req.executable), visible_only=True)
+        _compile_title_regex(query)
+
+    try:
+        proc = subprocess.Popen(argv, cwd=cwd)
+    except FileNotFoundError as exc:
+        raise HTTPException(status_code=400, detail=f"executable not found: {exc}") from exc
+    except OSError as exc:
+        raise HTTPException(status_code=400, detail=f"failed to launch process: {exc}") from exc
+
+    result = {
+        "executed": True,
+        "dry_run": False,
+        "argv": argv,
+        "cwd": cwd,
+        "pid": proc.pid,
+    }
+
+    if query is not None:
+        deadline = time.monotonic() + (req.timeout_ms / 1000.0)
+        while True:  # poll at least once, even with timeout_ms=0
+            matches = _list_windows(query)
+            if matches or time.monotonic() >= deadline:
+                break
+            time.sleep(0.2)
+        match = matches[0] if matches else None
+        result["window"] = match
+        result["window_found"] = match is not None
+
+    return result
+
+
 def _pick_shell(explicit_shell: str | None) -> str:
     shell_name = (explicit_shell or SETTINGS["exec_default_shell"] or "powershell").lower().strip()
     if shell_name not in {"powershell", "bash", "cmd"}:
@@ -799,6 +1107,57 @@ def exec_command(
     }
 
 
+@app.get("/windows")
+def windows(
+    title_contains: str | None = None,
+    title_regex: str | None = None,
+    process_name: str | None = None,
+    hwnd: int | None = None,
+    visible_only: bool = True,
+    _: None = Depends(_auth),
+):
+    """List desktop windows matching the optional query filters."""
+    query = WindowQuery(
+        title_contains=title_contains,
+        title_regex=title_regex,
+        process_name=process_name,
+        hwnd=hwnd,
+        visible_only=visible_only,
+    )
+    matches = _list_windows(query)
+    return {
+        "ok": True,
+        "request_id": _request_id(),
+        "time_ms": _now_ms(),
+        "windows": matches,
+        "count": len(matches),
+    }
+
+
+@app.post("/windows/action")
+def window_action(req: WindowActionRequest, _: None = Depends(_auth)):
+    """Apply a lifecycle action to exactly one matched window."""
+    result = _apply_window_action(req)
+    return {
+        "ok": True,
+        "request_id": _request_id(),
+        "time_ms": _now_ms(),
+        "result": result,
+    }
+
+
+@app.post("/launch")
+def launch(req: LaunchRequest, _: None = Depends(_auth)):
+    """Start a process without a shell, optionally waiting for its window."""
+    result = _launch_app(req)
+    return {
+        "ok": True,
+        "request_id": _request_id(),
+        "time_ms": _now_ms(),
+        "result": result,
+    }
+
+
 @app.post("/ocr")
 def ocr(req: OCRRequest, screen: int = 0, _: None = Depends(_auth)):
     source = req.mode
diff --git a/skill/SKILL.md b/skill/SKILL.md
index 548163a..356427d 100644
--- a/skill/SKILL.md
+++ b/skill/SKILL.md
@@ -36,6 +36,9 @@ The agent should not assume it can self-install this stack.
 - `GET /displays` → detected displays in zero-based API order
 - `GET /screen?screen=0` → full screenshot (JSON with base64 by default, or raw image with `asImage=true`)
 - `POST /zoom?screen=0` → cropped screenshot around point/region (also supports `asImage=true`)
+- `GET /windows` → discover visible desktop windows and their handles/processes
+- `POST /windows/action` → focus/restore/minimize/maximize/close a matched window
+- `POST /launch` → start an app/process without dropping to a shell
 - `POST /ocr` → text extraction with bounding boxes from full screen, region, or provided image bytes
 - `POST /action?screen=0` → single interaction (`move`, `click`, `scroll`, `type`, `hotkey`, ...)
 - `POST /batch?screen=0` → sequential action list
@@ -123,11 +126,11 @@ Prefer structured GUI control first:
 - `/action` or `/batch` to interact
 
 Use `/exec` only when it is the cleanest available tool for the job, for example:
-- launching an app that is not already visible
 - querying machine state that the GUI does not expose well
 - performing an explicit user-requested shell/system task
 - recovering from a blocked GUI flow when normal interaction failed
 
+Prefer `GET /windows`, `POST /windows/action`, and `POST /launch` for app lifecycle tasks before falling back to `/exec`.
 Avoid using `/exec` for routine in-app clicks, menu navigation, or text entry when the GUI can be driven directly.
 
 ## Core workflow (mandatory)