feat: support key combinations in press_key function and update related tests
This commit is contained in:
@@ -170,6 +170,7 @@ Each job payload includes:
|
||||
- Use `see_screen` before UI interaction.
|
||||
- Use `enhance` when text is unclear.
|
||||
- Use `press_key` for non-text keys (Enter, Tab, arrows, Escape).
|
||||
- For shortcuts, use one `press_key` call with combo syntax (example: `win+r`).
|
||||
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
|
||||
- When done, call:
|
||||
- `task_complete(return="...", data=...)`
|
||||
|
||||
20
SKILL.md
20
SKILL.md
@@ -10,6 +10,7 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin
|
||||
- Mouse/keyboard control (`click`, `type`, `press_key`)
|
||||
- Terminal execution (`execute_command`, `sleep`)
|
||||
- Structured completion payload (`task_complete(return=..., data=...)`)
|
||||
- Automatic final verification screen capture on completion
|
||||
- Safety gate, auth, history, and live monitoring
|
||||
|
||||
## Important Environment Note
|
||||
@@ -30,7 +31,12 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr
|
||||
|
||||
1. Submit job via CLI or API.
|
||||
2. Agent performs tool loop.
|
||||
3. Read final `response.return` and `response.data` from job status.
|
||||
3. Read final `response.return`, `response.data`, and `verification` from job status.
|
||||
|
||||
Keyboard combo rule:
|
||||
|
||||
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
|
||||
- Do not split modifier combos into separate calls.
|
||||
|
||||
## API Quick Reference
|
||||
|
||||
@@ -79,10 +85,18 @@ Result contract in job payload:
|
||||
"status": "completed",
|
||||
"response": {
|
||||
"return": "Task completed successfully",
|
||||
"data": "file1.txt\nfile2.txt"
|
||||
"data": "file1.txt\nfile2.txt",
|
||||
"verification": {
|
||||
"ok": true,
|
||||
"path": "C:/.../screens/screen_final_verification_step_006.png"
|
||||
}
|
||||
},
|
||||
"return": "Task completed successfully",
|
||||
"data": "file1.txt\nfile2.txt"
|
||||
"data": "file1.txt\nfile2.txt",
|
||||
"verification": {
|
||||
"ok": true,
|
||||
"path": "C:/.../screens/screen_final_verification_step_006.png"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
43
src/agent.py
43
src/agent.py
@@ -36,6 +36,7 @@ Rules:
|
||||
4) Coordinates are absolute screen pixels (x, y) from top-left.
|
||||
5) Use enhance(coordinate) when text/UI is unclear.
|
||||
6) For keyboard-heavy interactions, prefer press_key for special keys.
|
||||
6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
|
||||
7) You may call multiple tools in one step. If needed, do click then sleep.
|
||||
8) Never spam repeated clicks on the same coordinate; switch strategy.
|
||||
9) Keep tool arguments valid JSON and concise.
|
||||
@@ -223,7 +224,10 @@ class ScreenJobAgent:
|
||||
{
|
||||
"type": "function",
|
||||
"name": "press_key",
|
||||
"description": "Press a specific key (enter, tab, esc, arrows, etc).",
|
||||
"description": (
|
||||
"Press a key or key combo. "
|
||||
"For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -483,20 +487,47 @@ class ScreenJobAgent:
|
||||
time.sleep(self.options.type_interval)
|
||||
return {"ok": True, "typed_length": len(text), "message": "Text typed."}
|
||||
|
||||
def _normalize_key_name(self, key: str) -> str:
|
||||
normalized = key.strip().lower()
|
||||
aliases = {
|
||||
"windows": "win",
|
||||
"windowskey": "win",
|
||||
"meta": "win",
|
||||
"super": "win",
|
||||
"cmd": "command",
|
||||
}
|
||||
return aliases.get(normalized, normalized)
|
||||
|
||||
def _parse_key_combo(self, raw: str) -> list[str]:
|
||||
combo = raw.replace(" ", "")
|
||||
if not combo:
|
||||
return []
|
||||
parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()]
|
||||
return parts
|
||||
|
||||
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
key = str(args.get("key", "")).strip().lower()
|
||||
repeats = max(1, int(args.get("repeats", 1)))
|
||||
if not key:
|
||||
return {"ok": False, "error": "Missing key."}
|
||||
repeats = min(repeats, 50)
|
||||
pressed = 0
|
||||
combo = self._parse_key_combo(key)
|
||||
if not combo:
|
||||
return {"ok": False, "error": "Invalid key."}
|
||||
|
||||
executed = 0
|
||||
for _ in range(repeats):
|
||||
if self._is_cancelled():
|
||||
break
|
||||
pyautogui.press(key)
|
||||
pressed += 1
|
||||
if len(combo) == 1:
|
||||
pyautogui.press(combo[0])
|
||||
else:
|
||||
pyautogui.hotkey(*combo)
|
||||
executed += 1
|
||||
time.sleep(0.03)
|
||||
return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}
|
||||
combo_text = "+".join(combo)
|
||||
message = "Key combo executed." if len(combo) > 1 else "Key press executed."
|
||||
return {"ok": True, "key": combo_text, "repeats": executed, "message": message}
|
||||
|
||||
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
|
||||
@@ -647,6 +678,7 @@ class ScreenJobAgent:
|
||||
"text": (
|
||||
f"JOB: {job}\n"
|
||||
"You are in an action loop. Prefer execute_command for deterministic actions. "
|
||||
"For modifier shortcuts, use a single press_key combo (example: win+r). "
|
||||
"You can return multiple tool calls in one step (example: click then sleep). "
|
||||
"When done call task_complete(return=..., data=...). "
|
||||
"Include useful structured output in data. "
|
||||
@@ -701,6 +733,7 @@ class ScreenJobAgent:
|
||||
"type": "input_text",
|
||||
"text": (
|
||||
"No function call was returned. Continue by using tools. "
|
||||
"Use one press_key call for key combos like win+r. "
|
||||
"You may call multiple tools in one step. "
|
||||
"When complete, call task_complete(return=..., data=...). "
|
||||
"Always finish with a final screen verification."
|
||||
|
||||
@@ -16,6 +16,7 @@ class _DummyPyAutoGUI:
|
||||
def __init__(self) -> None:
|
||||
self.last_move_to: tuple[int, int] | None = None
|
||||
self.last_click: tuple[int, int] | None = None
|
||||
self.last_hotkey: tuple[str, ...] | None = None
|
||||
|
||||
def screenshot(self) -> Image.Image:
|
||||
return Image.new("RGB", (1280, 720), color=(24, 24, 24))
|
||||
@@ -35,6 +36,9 @@ class _DummyPyAutoGUI:
|
||||
def press(self, _: str) -> None:
|
||||
return None
|
||||
|
||||
def hotkey(self, *keys: str) -> None:
    """Remember the keys of the latest hotkey call; nothing is pressed."""
    recorded: tuple[str, ...] = (*keys,)
    self.last_hotkey = recorded
|
||||
|
||||
|
||||
def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent:
|
||||
dummy_gui = _DummyPyAutoGUI()
|
||||
@@ -89,3 +93,12 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None
|
||||
)
|
||||
assert click_result["ok"] is True
|
||||
assert click_result["clicked"] == {"x": 110, "y": 102}
|
||||
|
||||
|
||||
def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
    """A ``meta+r`` combo is reported as ``win+r`` and issued as one hotkey."""
    screen_agent = _build_agent(tmp_path, monkeypatch)

    outcome = screen_agent._tool_press_key({"key": "meta+r"})

    # The tool result reports the normalized combo.
    assert outcome["ok"] is True
    assert outcome["key"] == "win+r"
    assert outcome["message"] == "Key combo executed."
    # The dummy GUI recorded the hotkey call with the alias resolved.
    assert agent_module.pyautogui.last_hotkey == ("win", "r")
|
||||
|
||||
Reference in New Issue
Block a user