feat: support key combinations in press_key function and update related tests

2026-05-27 21:20:03 +02:00
parent 278f011a6d
commit 48a145d147
4 changed files with 69 additions and 8 deletions
--- a/README.md
+++ b/README.md
@@ -170,6 +170,7 @@ Each job payload includes:
 - Use `see_screen` before UI interaction.
 - Use `enhance` when text is unclear.
 - Use `press_key` for non-text keys (Enter, Tab, arrows, Escape).
+- For shortcuts, use one `press_key` call with combo syntax (example: `win+r`).
 - Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
 - When done, call:
  - `task_complete(return="...", data=...)`
--- a/SKILL.md
+++ b/SKILL.md
@@ -10,6 +10,7 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin
 - Mouse/keyboard control (`click`, `type`, `press_key`)
 - Terminal execution (`execute_command`, `sleep`)
 - Structured completion payload (`task_complete(return=..., data=...)`)
+- Automatic final verification screen capture on completion
 - Safety gate, auth, history, and live monitoring

 ## Important Environment Note
@@ -30,7 +31,12 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr

 1. Submit job via CLI or API.
 2. Agent performs tool loop.
-3. Read final `response.return` and `response.data` from job status.
+3. Read final `response.return`, `response.data`, and `verification` from job status.
+
+Keyboard combo rule:
+
+- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
+- Do not split modifier combos into separate calls.

 ## API Quick Reference

@@ -79,10 +85,18 @@ Result contract in job payload:
  "status": "completed",
  "response": {
    "return": "Task completed successfully",
-    "data": "file1.txt\nfile2.txt"
+    "data": "file1.txt\nfile2.txt",
+    "verification": {
+      "ok": true,
+      "path": "C:/.../screens/screen_final_verification_step_006.png"
+    }
  },
  "return": "Task completed successfully",
-  "data": "file1.txt\nfile2.txt"
+  "data": "file1.txt\nfile2.txt",
+  "verification": {
+    "ok": true,
+    "path": "C:/.../screens/screen_final_verification_step_006.png"
+  }
 }
 ```

--- a/src/agent.py
+++ b/src/agent.py
@@ -36,6 +36,7 @@ Rules:
 4) Coordinates are absolute screen pixels (x, y) from top-left.
 5) Use enhance(coordinate) when text/UI is unclear.
 6) For keyboard-heavy interactions, prefer press_key for special keys.
+6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
 7) You may call multiple tools in one step. If needed, do click then sleep.
 8) Never spam repeated clicks on the same coordinate; switch strategy.
 9) Keep tool arguments valid JSON and concise.
@@ -223,7 +224,10 @@ class ScreenJobAgent:
            {
                "type": "function",
                "name": "press_key",
-                "description": "Press a specific key (enter, tab, esc, arrows, etc).",
+                "description": (
+                    "Press a key or key combo. "
+                    "For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')."
+                ),
                "parameters": {
                    "type": "object",
                    "properties": {
@@ -483,20 +487,47 @@ class ScreenJobAgent:
            time.sleep(self.options.type_interval)
        return {"ok": True, "typed_length": len(text), "message": "Text typed."}

+    def _normalize_key_name(self, key: str) -> str:
+        normalized = key.strip().lower()
+        aliases = {
+            "windows": "win",
+            "windowskey": "win",
+            "meta": "win",
+            "super": "win",
+            "cmd": "command",
+        }
+        return aliases.get(normalized, normalized)
+
+    def _parse_key_combo(self, raw: str) -> list[str]:
+        combo = raw.replace(" ", "")
+        if not combo:
+            return []
+        parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()]
+        return parts
+
    def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
        key = str(args.get("key", "")).strip().lower()
        repeats = max(1, int(args.get("repeats", 1)))
        if not key:
            return {"ok": False, "error": "Missing key."}
        repeats = min(repeats, 50)
-        pressed = 0
+        combo = self._parse_key_combo(key)
+        if not combo:
+            return {"ok": False, "error": "Invalid key."}
+
+        executed = 0
        for _ in range(repeats):
            if self._is_cancelled():
                break
-            pyautogui.press(key)
-            pressed += 1
+            if len(combo) == 1:
+                pyautogui.press(combo[0])
+            else:
+                pyautogui.hotkey(*combo)
+            executed += 1
            time.sleep(0.03)
-        return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}
+        combo_text = "+".join(combo)
+        message = "Key combo executed." if len(combo) > 1 else "Key press executed."
+        return {"ok": True, "key": combo_text, "repeats": executed, "message": message}

    def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
        seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
@@ -647,6 +678,7 @@ class ScreenJobAgent:
                        "text": (
                            f"JOB: {job}\n"
                            "You are in an action loop. Prefer execute_command for deterministic actions. "
+                            "For modifier shortcuts, use a single press_key combo (example: win+r). "
                            "You can return multiple tool calls in one step (example: click then sleep). "
                            "When done call task_complete(return=..., data=...). "
                            "Include useful structured output in data. "
@@ -701,6 +733,7 @@ class ScreenJobAgent:
                                "type": "input_text",
                                "text": (
                                    "No function call was returned. Continue by using tools. "
+                                    "Use one press_key call for key combos like win+r. "
                                    "You may call multiple tools in one step. "
                                    "When complete, call task_complete(return=..., data=...). "
                                    "Always finish with a final screen verification."
--- a/tests/test_agent_tools.py
+++ b/tests/test_agent_tools.py
@@ -16,6 +16,7 @@ class _DummyPyAutoGUI:
    def __init__(self) -> None:
        self.last_move_to: tuple[int, int] | None = None
        self.last_click: tuple[int, int] | None = None
+        self.last_hotkey: tuple[str, ...] | None = None

    def screenshot(self) -> Image.Image:
        return Image.new("RGB", (1280, 720), color=(24, 24, 24))
@@ -35,6 +36,9 @@ class _DummyPyAutoGUI:
    def press(self, _: str) -> None:
        return None

+    def hotkey(self, *keys: str) -> None:
+        self.last_hotkey = tuple(keys)
+

 def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent:
    dummy_gui = _DummyPyAutoGUI()
@@ -89,3 +93,12 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None
    )
    assert click_result["ok"] is True
    assert click_result["clicked"] == {"x": 110, "y": 102}
+
+
+def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
+    agent = _build_agent(tmp_path, monkeypatch)
+    result = agent._tool_press_key({"key": "meta+r"})
+    assert result["ok"] is True
+    assert result["key"] == "win+r"
+    assert result["message"] == "Key combo executed."
+    assert agent_module.pyautogui.last_hotkey == ("win", "r")