feat: support key combinations in press_key function and update related tests
This commit is contained in:
43
src/agent.py
43
src/agent.py
@@ -36,6 +36,7 @@ Rules:
|
||||
4) Coordinates are absolute screen pixels (x, y) from top-left.
|
||||
5) Use enhance(coordinate) when text/UI is unclear.
|
||||
6) For keyboard-heavy interactions, prefer press_key for special keys.
|
||||
6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls.
|
||||
7) You may call multiple tools in one step. If needed, do click then sleep.
|
||||
8) Never spam repeated clicks on the same coordinate; switch strategy.
|
||||
9) Keep tool arguments valid JSON and concise.
|
||||
@@ -223,7 +224,10 @@ class ScreenJobAgent:
|
||||
{
|
||||
"type": "function",
|
||||
"name": "press_key",
|
||||
"description": "Press a specific key (enter, tab, esc, arrows, etc).",
|
||||
"description": (
|
||||
"Press a key or key combo. "
|
||||
"For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
@@ -483,20 +487,47 @@ class ScreenJobAgent:
|
||||
time.sleep(self.options.type_interval)
|
||||
return {"ok": True, "typed_length": len(text), "message": "Text typed."}
|
||||
|
||||
def _normalize_key_name(self, key: str) -> str:
|
||||
normalized = key.strip().lower()
|
||||
aliases = {
|
||||
"windows": "win",
|
||||
"windowskey": "win",
|
||||
"meta": "win",
|
||||
"super": "win",
|
||||
"cmd": "command",
|
||||
}
|
||||
return aliases.get(normalized, normalized)
|
||||
|
||||
def _parse_key_combo(self, raw: str) -> list[str]:
|
||||
combo = raw.replace(" ", "")
|
||||
if not combo:
|
||||
return []
|
||||
parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()]
|
||||
return parts
|
||||
|
||||
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
key = str(args.get("key", "")).strip().lower()
|
||||
repeats = max(1, int(args.get("repeats", 1)))
|
||||
if not key:
|
||||
return {"ok": False, "error": "Missing key."}
|
||||
repeats = min(repeats, 50)
|
||||
pressed = 0
|
||||
combo = self._parse_key_combo(key)
|
||||
if not combo:
|
||||
return {"ok": False, "error": "Invalid key."}
|
||||
|
||||
executed = 0
|
||||
for _ in range(repeats):
|
||||
if self._is_cancelled():
|
||||
break
|
||||
pyautogui.press(key)
|
||||
pressed += 1
|
||||
if len(combo) == 1:
|
||||
pyautogui.press(combo[0])
|
||||
else:
|
||||
pyautogui.hotkey(*combo)
|
||||
executed += 1
|
||||
time.sleep(0.03)
|
||||
return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}
|
||||
combo_text = "+".join(combo)
|
||||
message = "Key combo executed." if len(combo) > 1 else "Key press executed."
|
||||
return {"ok": True, "key": combo_text, "repeats": executed, "message": message}
|
||||
|
||||
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
|
||||
@@ -647,6 +678,7 @@ class ScreenJobAgent:
|
||||
"text": (
|
||||
f"JOB: {job}\n"
|
||||
"You are in an action loop. Prefer execute_command for deterministic actions. "
|
||||
"For modifier shortcuts, use a single press_key combo (example: win+r). "
|
||||
"You can return multiple tool calls in one step (example: click then sleep). "
|
||||
"When done call task_complete(return=..., data=...). "
|
||||
"Include useful structured output in data. "
|
||||
@@ -701,6 +733,7 @@ class ScreenJobAgent:
|
||||
"type": "input_text",
|
||||
"text": (
|
||||
"No function call was returned. Continue by using tools. "
|
||||
"Use one press_key call for key combos like win+r. "
|
||||
"You may call multiple tools in one step. "
|
||||
"When complete, call task_complete(return=..., data=...). "
|
||||
"Always finish with a final screen verification."
|
||||
|
||||
Reference in New Issue
Block a user