feat: support key combinations in press_key function and update related tests

This commit is contained in:
Space-Banane
2026-05-27 21:20:03 +02:00
parent 278f011a6d
commit 48a145d147
4 changed files with 69 additions and 8 deletions

View File

@@ -36,6 +36,7 @@ Rules:
4) Coordinates are absolute screen pixels (x, y) from top-left.
5) Use enhance(coordinate) when text/UI is unclear.
6) For keyboard-heavy interactions, prefer press_key for special keys.
6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
7) You may call multiple tools in one step. If needed, do click then sleep.
8) Never spam repeated clicks on the same coordinate; switch strategy.
9) Keep tool arguments valid JSON and concise.
@@ -223,7 +224,10 @@ class ScreenJobAgent:
{
"type": "function",
"name": "press_key",
"description": "Press a specific key (enter, tab, esc, arrows, etc).",
"description": (
"Press a key or key combo. "
"For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')."
),
"parameters": {
"type": "object",
"properties": {
@@ -483,20 +487,47 @@ class ScreenJobAgent:
time.sleep(self.options.type_interval)
return {"ok": True, "typed_length": len(text), "message": "Text typed."}
def _normalize_key_name(self, key: str) -> str:
normalized = key.strip().lower()
aliases = {
"windows": "win",
"windowskey": "win",
"meta": "win",
"super": "win",
"cmd": "command",
}
return aliases.get(normalized, normalized)
def _parse_key_combo(self, raw: str) -> list[str]:
combo = raw.replace(" ", "")
if not combo:
return []
parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()]
return parts
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
    """Press a single key or a ``"+"``-separated key combination.

    Args:
        args: Tool arguments. ``key`` is a key name or combo such as
            ``"enter"`` or ``"ctrl+shift+esc"``. ``repeats`` is how many
            times to send it; it is clamped to the range 1..50 and falls
            back to 1 when missing or not convertible to an integer.

    Returns:
        A result dict. On success ``ok`` is True, ``key`` holds the
        normalized combo joined with ``"+"`` and ``repeats`` the number of
        presses actually sent (fewer than requested if the job was
        cancelled part-way). On failure ``ok`` is False and ``error``
        describes the problem.
    """
    raw_key = args.get("key")
    # Treat an explicit null like a missing key instead of the text "none".
    key = "" if raw_key is None else str(raw_key).strip().lower()
    if not key:
        return {"ok": False, "error": "Missing key."}

    combo = self._parse_key_combo(key)
    if not combo:
        return {"ok": False, "error": "Invalid key."}

    # Tool arguments are model-generated, so tolerate null / non-numeric
    # repeat counts rather than raising out of the tool call.
    try:
        repeats = int(args.get("repeats", 1))
    except (TypeError, ValueError, OverflowError):
        repeats = 1
    repeats = min(max(repeats, 1), 50)

    is_combo = len(combo) > 1
    executed = 0
    for _ in range(repeats):
        if self._is_cancelled():
            break
        if is_combo:
            # hotkey presses the keys in order and releases them in reverse.
            pyautogui.hotkey(*combo)
        else:
            pyautogui.press(combo[0])
        executed += 1
        # Short pause between repeats.
        time.sleep(0.03)

    message = "Key combo executed." if is_combo else "Key press executed."
    return {
        "ok": True,
        "key": "+".join(combo),
        "repeats": executed,
        "message": message,
    }
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
@@ -647,6 +678,7 @@ class ScreenJobAgent:
"text": (
f"JOB: {job}\n"
"You are in an action loop. Prefer execute_command for deterministic actions. "
"For modifier shortcuts, use a single press_key combo (example: win+r). "
"You can return multiple tool calls in one step (example: click then sleep). "
"When done call task_complete(return=..., data=...). "
"Include useful structured output in data. "
@@ -701,6 +733,7 @@ class ScreenJobAgent:
"type": "input_text",
"text": (
"No function call was returned. Continue by using tools. "
"Use one press_key call for key combos like win+r. "
"You may call multiple tools in one step. "
"When complete, call task_complete(return=..., data=...). "
"Always finish with a final screen verification."