feat: support key combinations in press_key function and update related tests
This commit is contained in:
@@ -170,6 +170,7 @@ Each job payload includes:
|
|||||||
- Use `see_screen` before UI interaction.
|
- Use `see_screen` before UI interaction.
|
||||||
- Use `enhance` when text is unclear.
|
- Use `enhance` when text is unclear.
|
||||||
- Use `press_key` for non-text keys (Enter, Tab, arrows, Escape).
|
- Use `press_key` for non-text keys (Enter, Tab, arrows, Escape).
|
||||||
|
- For shortcuts, use one `press_key` call with combo syntax (example: `win+r`).
|
||||||
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
|
- Use `click` offsets via `offset_up/down/left/right` and optional `sleep_after_seconds`.
|
||||||
- When done, call:
|
- When done, call:
|
||||||
- `task_complete(return="...", data=...)`
|
- `task_complete(return="...", data=...)`
|
||||||
|
|||||||
20
SKILL.md
20
SKILL.md
@@ -10,6 +10,7 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin
|
|||||||
- Mouse/keyboard control (`click`, `type`, `press_key`)
|
- Mouse/keyboard control (`click`, `type`, `press_key`)
|
||||||
- Terminal execution (`execute_command`, `sleep`)
|
- Terminal execution (`execute_command`, `sleep`)
|
||||||
- Structured completion payload (`task_complete(return=..., data=...)`)
|
- Structured completion payload (`task_complete(return=..., data=...)`)
|
||||||
|
- Automatic final verification screen capture on completion
|
||||||
- Safety gate, auth, history, and live monitoring
|
- Safety gate, auth, history, and live monitoring
|
||||||
|
|
||||||
## Important Environment Note
|
## Important Environment Note
|
||||||
@@ -30,7 +31,12 @@ Agents can use ScreenJob to launch and control GUI workflows, including orchestr
|
|||||||
|
|
||||||
1. Submit job via CLI or API.
|
1. Submit job via CLI or API.
|
||||||
2. Agent performs tool loop.
|
2. Agent performs tool loop.
|
||||||
3. Read final `response.return` and `response.data` from job status.
|
3. Read final `response.return`, `response.data`, and `verification` from job status.
|
||||||
|
|
||||||
|
Keyboard combo rule:
|
||||||
|
|
||||||
|
- For shortcuts, use one `press_key` call with combo syntax, for example: `win+r`, `ctrl+shift+esc`.
|
||||||
|
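- Do not split modifier combos into separate calls: each `press_key` call presses and releases its key, so the modifier is no longer held when the next key arrives.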
|
||||||
|
|
||||||
## API Quick Reference
|
## API Quick Reference
|
||||||
|
|
||||||
@@ -79,10 +85,18 @@ Result contract in job payload:
|
|||||||
"status": "completed",
|
"status": "completed",
|
||||||
"response": {
|
"response": {
|
||||||
"return": "Task completed successfully",
|
"return": "Task completed successfully",
|
||||||
"data": "file1.txt\nfile2.txt"
|
"data": "file1.txt\nfile2.txt",
|
||||||
|
"verification": {
|
||||||
|
"ok": true,
|
||||||
|
"path": "C:/.../screens/screen_final_verification_step_006.png"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"return": "Task completed successfully",
|
"return": "Task completed successfully",
|
||||||
"data": "file1.txt\nfile2.txt"
|
"data": "file1.txt\nfile2.txt",
|
||||||
|
"verification": {
|
||||||
|
"ok": true,
|
||||||
|
"path": "C:/.../screens/screen_final_verification_step_006.png"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
43
src/agent.py
43
src/agent.py
@@ -36,6 +36,7 @@ Rules:
|
|||||||
4) Coordinates are absolute screen pixels (x, y) from top-left.
|
4) Coordinates are absolute screen pixels (x, y) from top-left.
|
||||||
5) Use enhance(coordinate) when text/UI is unclear.
|
5) Use enhance(coordinate) when text/UI is unclear.
|
||||||
6) For keyboard-heavy interactions, prefer press_key for special keys.
|
6) For keyboard-heavy interactions, prefer press_key for special keys.
|
||||||
|
6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
|
||||||
7) You may call multiple tools in one step. If needed, do click then sleep.
|
7) You may call multiple tools in one step. If needed, do click then sleep.
|
||||||
8) Never spam repeated clicks on the same coordinate; switch strategy.
|
8) Never spam repeated clicks on the same coordinate; switch strategy.
|
||||||
9) Keep tool arguments valid JSON and concise.
|
9) Keep tool arguments valid JSON and concise.
|
||||||
@@ -223,7 +224,10 @@ class ScreenJobAgent:
|
|||||||
{
|
{
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"name": "press_key",
|
"name": "press_key",
|
||||||
"description": "Press a specific key (enter, tab, esc, arrows, etc).",
|
"description": (
|
||||||
|
"Press a key or key combo. "
|
||||||
|
"For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')."
|
||||||
|
),
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
@@ -483,20 +487,47 @@ class ScreenJobAgent:
|
|||||||
time.sleep(self.options.type_interval)
|
time.sleep(self.options.type_interval)
|
||||||
return {"ok": True, "typed_length": len(text), "message": "Text typed."}
|
return {"ok": True, "typed_length": len(text), "message": "Text typed."}
|
||||||
|
|
||||||
|
def _normalize_key_name(self, key: str) -> str:
|
||||||
|
normalized = key.strip().lower()
|
||||||
|
aliases = {
|
||||||
|
"windows": "win",
|
||||||
|
"windowskey": "win",
|
||||||
|
"meta": "win",
|
||||||
|
"super": "win",
|
||||||
|
"cmd": "command",
|
||||||
|
}
|
||||||
|
return aliases.get(normalized, normalized)
|
||||||
|
|
||||||
|
def _parse_key_combo(self, raw: str) -> list[str]:
|
||||||
|
combo = raw.replace(" ", "")
|
||||||
|
if not combo:
|
||||||
|
return []
|
||||||
|
parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()]
|
||||||
|
return parts
|
||||||
|
|
||||||
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
|
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||||
key = str(args.get("key", "")).strip().lower()
|
key = str(args.get("key", "")).strip().lower()
|
||||||
repeats = max(1, int(args.get("repeats", 1)))
|
repeats = max(1, int(args.get("repeats", 1)))
|
||||||
if not key:
|
if not key:
|
||||||
return {"ok": False, "error": "Missing key."}
|
return {"ok": False, "error": "Missing key."}
|
||||||
repeats = min(repeats, 50)
|
repeats = min(repeats, 50)
|
||||||
pressed = 0
|
combo = self._parse_key_combo(key)
|
||||||
|
if not combo:
|
||||||
|
return {"ok": False, "error": "Invalid key."}
|
||||||
|
|
||||||
|
executed = 0
|
||||||
for _ in range(repeats):
|
for _ in range(repeats):
|
||||||
if self._is_cancelled():
|
if self._is_cancelled():
|
||||||
break
|
break
|
||||||
pyautogui.press(key)
|
if len(combo) == 1:
|
||||||
pressed += 1
|
pyautogui.press(combo[0])
|
||||||
|
else:
|
||||||
|
pyautogui.hotkey(*combo)
|
||||||
|
executed += 1
|
||||||
time.sleep(0.03)
|
time.sleep(0.03)
|
||||||
return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}
|
combo_text = "+".join(combo)
|
||||||
|
message = "Key combo executed." if len(combo) > 1 else "Key press executed."
|
||||||
|
return {"ok": True, "key": combo_text, "repeats": executed, "message": message}
|
||||||
|
|
||||||
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
|
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||||
seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
|
seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
|
||||||
@@ -647,6 +678,7 @@ class ScreenJobAgent:
|
|||||||
"text": (
|
"text": (
|
||||||
f"JOB: {job}\n"
|
f"JOB: {job}\n"
|
||||||
"You are in an action loop. Prefer execute_command for deterministic actions. "
|
"You are in an action loop. Prefer execute_command for deterministic actions. "
|
||||||
|
"For modifier shortcuts, use a single press_key combo (example: win+r). "
|
||||||
"You can return multiple tool calls in one step (example: click then sleep). "
|
"You can return multiple tool calls in one step (example: click then sleep). "
|
||||||
"When done call task_complete(return=..., data=...). "
|
"When done call task_complete(return=..., data=...). "
|
||||||
"Include useful structured output in data. "
|
"Include useful structured output in data. "
|
||||||
@@ -701,6 +733,7 @@ class ScreenJobAgent:
|
|||||||
"type": "input_text",
|
"type": "input_text",
|
||||||
"text": (
|
"text": (
|
||||||
"No function call was returned. Continue by using tools. "
|
"No function call was returned. Continue by using tools. "
|
||||||
|
"Use one press_key call for key combos like win+r. "
|
||||||
"You may call multiple tools in one step. "
|
"You may call multiple tools in one step. "
|
||||||
"When complete, call task_complete(return=..., data=...). "
|
"When complete, call task_complete(return=..., data=...). "
|
||||||
"Always finish with a final screen verification."
|
"Always finish with a final screen verification."
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ class _DummyPyAutoGUI:
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.last_move_to: tuple[int, int] | None = None
|
self.last_move_to: tuple[int, int] | None = None
|
||||||
self.last_click: tuple[int, int] | None = None
|
self.last_click: tuple[int, int] | None = None
|
||||||
|
self.last_hotkey: tuple[str, ...] | None = None
|
||||||
|
|
||||||
def screenshot(self) -> Image.Image:
|
def screenshot(self) -> Image.Image:
|
||||||
return Image.new("RGB", (1280, 720), color=(24, 24, 24))
|
return Image.new("RGB", (1280, 720), color=(24, 24, 24))
|
||||||
@@ -35,6 +36,9 @@ class _DummyPyAutoGUI:
|
|||||||
def press(self, _: str) -> None:
|
def press(self, _: str) -> None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def hotkey(self, *keys: str) -> None:
|
||||||
|
self.last_hotkey = tuple(keys)
|
||||||
|
|
||||||
|
|
||||||
def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent:
|
def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent:
|
||||||
dummy_gui = _DummyPyAutoGUI()
|
dummy_gui = _DummyPyAutoGUI()
|
||||||
@@ -89,3 +93,12 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None
|
|||||||
)
|
)
|
||||||
assert click_result["ok"] is True
|
assert click_result["ok"] is True
|
||||||
assert click_result["clicked"] == {"x": 110, "y": 102}
|
assert click_result["clicked"] == {"x": 110, "y": 102}
|
||||||
|
|
||||||
|
|
||||||
|
def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None:
|
||||||
|
agent = _build_agent(tmp_path, monkeypatch)
|
||||||
|
result = agent._tool_press_key({"key": "meta+r"})
|
||||||
|
assert result["ok"] is True
|
||||||
|
assert result["key"] == "win+r"
|
||||||
|
assert result["message"] == "Key combo executed."
|
||||||
|
assert agent_module.pyautogui.last_hotkey == ("win", "r")
|
||||||
|
|||||||
Reference in New Issue
Block a user