From 48a145d147625b31e3cea76eae350721dfaafa69 Mon Sep 17 00:00:00 2001 From: Space-Banane Date: Wed, 27 May 2026 21:20:03 +0200 Subject: [PATCH] feat: support key combinations in press_key function and update related tests --- README.md | 1 + SKILL.md | 20 +++++++++++++++--- src/agent.py | 43 ++++++++++++++++++++++++++++++++++----- tests/test_agent_tools.py | 13 ++++++++++++ 4 files changed, 69 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4d6f87f..0fa1929 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,7 @@ Each job payload includes: - Use `see_screen` before UI interaction. - Use `enhance` when text is unclear. - Use `press_key` for non-text keys (Enter, Tab, arrows, Escape). +- For shortcuts, use one `press_key` call with combo syntax (example: `win+r`). - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`. - When done, call: - `task_complete(return="...", data=...)` diff --git a/SKILL.md b/SKILL.md index 2f4b94a..89b649e 100644 --- a/SKILL.md +++ b/SKILL.md @@ -10,6 +10,7 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin - Mouse/keyboard control (`click`, `type`, `press_key`) - Terminal execution (`execute_command`, `sleep`) - Structured completion payload (`task_complete(return=..., data=...)`) +- Automatic final verification screen capture on completion - Safety gate, auth, history, and live monitoring ## Important Environment Note @@ -30,7 +31,12 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr 1. Submit job via CLI or API. 2. Agent performs tool loop. -3. Read final `response.return` and `response.data` from job status. +3. Read final `response.return`, `response.data`, and `verification` from job status. + +Keyboard combo rule: + +- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`. +- Do not split modifier combos into separate calls. ## API Quick Reference @@ -79,10 +85,18 @@ Result contract in job payload: "status": "completed", "response": { "return": "Task completed successfully", - "data": "file1.txt\nfile2.txt" + "data": "file1.txt\nfile2.txt", + "verification": { + "ok": true, + "path": "C:/.../screens/screen_final_verification_step_006.png" + } }, "return": "Task completed successfully", - "data": "file1.txt\nfile2.txt" + "data": "file1.txt\nfile2.txt", + "verification": { + "ok": true, + "path": "C:/.../screens/screen_final_verification_step_006.png" + } } ``` diff --git a/src/agent.py b/src/agent.py index 834b3b9..dff6a1c 100644 --- a/src/agent.py +++ b/src/agent.py @@ -36,6 +36,7 @@ Rules: 4) Coordinates are absolute screen pixels (x, y) from top-left. 5) Use enhance(coordinate) when text/UI is unclear. 6) For keyboard-heavy interactions, prefer press_key for special keys. +6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls. 7) You may call multiple tools in one step. If needed, do click then sleep. 8) Never spam repeated clicks on the same coordinate; switch strategy. 9) Keep tool arguments valid JSON and concise. @@ -223,7 +224,10 @@ class ScreenJobAgent: { "type": "function", "name": "press_key", - "description": "Press a specific key (enter, tab, esc, arrows, etc).", + "description": ( + "Press a key or key combo. " + "For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')." + ), "parameters": { "type": "object", "properties": { @@ -483,20 +487,47 @@ class ScreenJobAgent: time.sleep(self.options.type_interval) return {"ok": True, "typed_length": len(text), "message": "Text typed."} + def _normalize_key_name(self, key: str) -> str: + normalized = key.strip().lower() + aliases = { + "windows": "win", + "windowskey": "win", + "meta": "win", + "super": "win", + "cmd": "command", + } + return aliases.get(normalized, normalized) + + def _parse_key_combo(self, raw: str) -> list[str]: + combo = raw.replace(" ", "") + if not combo: + return [] + parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()] + return parts + def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]: key = str(args.get("key", "")).strip().lower() repeats = max(1, int(args.get("repeats", 1))) if not key: return {"ok": False, "error": "Missing key."} repeats = min(repeats, 50) - pressed = 0 + combo = self._parse_key_combo(key) + if not combo: + return {"ok": False, "error": "Invalid key."} + + executed = 0 for _ in range(repeats): if self._is_cancelled(): break - pyautogui.press(key) - pressed += 1 + if len(combo) == 1: + pyautogui.press(combo[0]) + else: + pyautogui.hotkey(*combo) + executed += 1 time.sleep(0.03) - return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."} + combo_text = "+".join(combo) + message = "Key combo executed." if len(combo) > 1 else "Key press executed." + return {"ok": True, "key": combo_text, "repeats": executed, "message": message} def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]: seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0) @@ -647,6 +678,7 @@ class ScreenJobAgent: "text": ( f"JOB: {job}\n" "You are in an action loop. Prefer execute_command for deterministic actions. " + "For modifier shortcuts, use a single press_key combo (example: win+r). " "You can return multiple tool calls in one step (example: click then sleep). " "When done call task_complete(return=..., data=...). " "Include useful structured output in data. " @@ -701,6 +733,7 @@ class ScreenJobAgent: "type": "input_text", "text": ( "No function call was returned. Continue by using tools. " + "Use one press_key call for key combos like win+r. " "You may call multiple tools in one step. " "When complete, call task_complete(return=..., data=...). " "Always finish with a final screen verification." diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index 6a76615..0db146e 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -16,6 +16,7 @@ class _DummyPyAutoGUI: def __init__(self) -> None: self.last_move_to: tuple[int, int] | None = None self.last_click: tuple[int, int] | None = None + self.last_hotkey: tuple[str, ...] | None = None def screenshot(self) -> Image.Image: return Image.new("RGB", (1280, 720), color=(24, 24, 24)) @@ -35,6 +36,9 @@ class _DummyPyAutoGUI: def press(self, _: str) -> None: return None + def hotkey(self, *keys: str) -> None: + self.last_hotkey = tuple(keys) + def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent: dummy_gui = _DummyPyAutoGUI() @@ -89,3 +93,12 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None ) assert click_result["ok"] is True assert click_result["clicked"] == {"x": 110, "y": 102} + + +def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + result = agent._tool_press_key({"key": "meta+r"}) + assert result["ok"] is True + assert result["key"] == "win+r" + assert result["message"] == "Key combo executed." + assert agent_module.pyautogui.last_hotkey == ("win", "r")