diff --git a/SKILL.md b/SKILL.md index 1f89df8..84a97fa 100644 --- a/SKILL.md +++ b/SKILL.md @@ -6,8 +6,10 @@ ScreenJob lets an agent execute tasks that require a real desktop UI plus termin ## Main Features +- Hybrid control model: screenshot grounding plus Windows-native window/dialog/element helpers when available - Screen perception (`see_screen`, `enhance`) - Mouse/keyboard control (`click`, `type`, `press_key`) +- Native window/dialog control (`list_windows`, `find_window`, `focus_window`, `detect_dialog`, `dialog_action`, `dialog_set_filename`, `list_ui_elements`) - Terminal execution (`execute_command`, `sleep`) - Structured completion payload (`task_complete(return=..., data=...)`) - Safety gate, auth, history, and live monitoring @@ -45,6 +47,12 @@ Enhance-first click rule: - Optional zoom control: set `scale` from `2` to `6` (defaults are tuned by region). - After checking the enhanced image, click using the same target coordinate (or a small directional offset if needed). +Windows-native routing rule: + +- First classify whether the current surface is a normal app window, browser window, `#32770` dialog, Explorer file picker, or another system surface. +- Prefer native window/dialog/element tools for focus changes, save/open dialogs, modal confirmations, and exposed controls. +- Fall back to screenshots plus mouse/keyboard only when native automation is unavailable or the UI is custom-drawn. + Verification rule: - Before `task_complete`, verify actual on-screen content matches the expected outcome. 
diff --git a/src/agent.py b/src/agent.py index 3f4aabf..8fe3c6e 100644 --- a/src/agent.py +++ b/src/agent.py @@ -1,11 +1,16 @@ from __future__ import annotations +import ctypes +import hashlib import json import logging +import os +import re import subprocess import threading import time import traceback +from datetime import datetime from typing import Any, Callable from openai import OpenAI @@ -24,29 +29,325 @@ else: _PYAUTOGUI_IMPORT_ERROR = None -SYSTEM_PROMPT = """ -You are ScreenJob, an autonomous desktop-and-terminal task executor. - -Rules: +CORE_OPERATING_DOCTRINE = """ +Core operating doctrine: 1) Use tools to act. Do not claim actions without tool calls. -2) Prefer execute_command for deterministic actions: - - opening URLs/websites (Windows: start https://amazon.de) - - launching apps or running terminal checks -3) For UI tasks, inspect with see_screen before clicking/typing. -4) Coordinates are absolute screen pixels (x, y) from top-left. -5) Use enhance before risky clicks: small buttons/icons, dense UI, or when target confidence is below high. -5a) For tiny controls use enhance(coordinate, region="small", mode="ui"). For tiny text use mode="text". -6) For keyboard-heavy interactions, prefer press_key for special keys. -6a) For key combinations, call press_key once with combo syntax (example: "win+r", "ctrl+shift+esc"). Do not split modifier combos across separate calls. -7) You may call multiple tools in one step. If needed, do click then sleep. -8) Never spam repeated clicks on the same coordinate; switch strategy. -9) Keep tool arguments valid JSON and concise. -10) When objective is fully complete, call task_complete(return="...", data=...). -11) The "data" field should contain structured output useful for the requester (for example command output text). -12) Before finishing, verify actual screen content matches the expected outcome. -13) For verification, call see_screen (and enhance if needed), then include a concise observed_result in data. 
+2) Maintain a live mental model of the current surface kind, foreground app identity, likely focus, pending dialog state, browser workflow state when relevant, pointer state when relevant, and the observed outcome of the last action. +3) Work in this loop: classify -> choose control channel -> execute one meaningful transition -> verify. +4) First classify, then act. Classify the current UI into window/app/dialog/browser/system surface before choosing the next tool family. +5) Tool precedence: native window/dialog/element tool when confidence is high; execute_command for deterministic launches/checks; see_screen and enhance for grounding; raw mouse/keyboard fallback only when native routes are unavailable or weaker. +6) Verify before risky action. Re-check after state-changing or ambiguous action. If uncertain, observe again. +7) Keep tool arguments valid JSON and concise. """ +WINDOWS_ENVIRONMENT_RULES = """ +Windows environment rules: +1) Treat command-launched apps or URLs as background until get_active_window, wait_for_focus_change, wait_for_window, or see_screen proves focus changed. +2) Use get_active_window, list_windows, find_window, focus_window, detect_dialog, and wait helpers to reason about focus transfer, modal ownership, background launches, Start menu/taskbar surfaces, Explorer pickers, and hidden windows. +3) Recognize common Windows surfaces explicitly: normal top-level app window, #32770 modal dialog, Explorer open/save picker, taskbar/start menu, context menu, toast notification, browser window chrome, browser page content, permission prompt, and secure desktop/UAC interruption. +4) If focus may be wrong, verify with get_active_window or a native wait/focus helper before typing or clicking. +5) If pointer state matters, call get_cursor_position before move_mouse or drag. Use move_mouse and drag only after confirming the intended region. 
+6) Prefer non-visual verification when available: clipboard_get for copy/cut, get_active_window or window tools for focus/open/close changes, element tools for enabled/visible state, and command stdout for deterministic shell actions. +""" + +BROWSER_WORKFLOW_RULES = """ +Browser workflow rules: +1) Distinguish browser chrome from page content. Address bar, tab strip, downloads controls, permission prompts, and file dialogs are browser or system surfaces, not page content. +2) For browser-heavy tasks, prefer native window/dialog/element operations for focus, tab/window selection, file upload/download dialogs, and browser-owned confirmation surfaces when exposed. +3) Use screenshots for page-content grounding and for custom-drawn browser surfaces that native tooling cannot read. +4) For upload/download flows, check first for dialog or browser-owned downloads UI before repeating pixel clicks. +""" + +DIALOG_HANDLING_RULES = """ +Dialog-handling rules: +1) If an unexpected modal or confirmation dialog appears, pause the prior plan and resolve or dismiss the modal first. +2) When a save/open/confirm dialog is likely, prefer detect_dialog, dialog_set_filename, dialog_action, wait_for_window, wait_for_dialog_close, list_ui_elements, and set_ui_element_value before coordinate clicking. +3) Common dialog actions include Open, Save, Cancel, OK, Yes, and No. Do not assume the dialog is gone until verified. +4) If secure desktop or UAC is detected or strongly suspected, report a blocked state explicitly instead of blind retries. +5) If a control is disabled, hidden, or not exposed natively, fall back to screenshot reasoning and raw input only after recording that fallback. +""" + +COMPLETION_VERIFICATION_RULES = """ +Completion and verification rules: +1) Use see_screen at a balanced cadence: before a new UI phase, after navigation or layout changes, after actions that could fail silently, and before task_complete. 
Do not spam screenshots after every deterministic micro-step. +2) Treat enhance as the default follow-up when a target is small, dense, visually similar, or text-heavy. For tiny controls use enhance(coordinate, region="small", mode="ui"). For tiny text use mode="text". +3) Never infer success from intent. Never spam repeated clicks on the same coordinate; switch strategy and re-observe. +4) Do not invent new subgoals. Unless the objective explicitly asks for it, do not search for saved files, browse the filesystem, reopen apps, or otherwise expand the task boundary after the requested result is already achieved. +5) After copy, paste, save, close, upload, download, or navigation milestones, prefer verify-and-finish over extra exploration. +6) When objective is fully complete, call task_complete(return="...", data=...). +7) Before task_complete, do a fresh verification pass with see_screen and add enhance when the proof is small, dense, or text-heavy. +8) In task_complete data, include explicit verification text such as data.observed_result describing what you just observed. +9) If runtime evidence says the objective is likely already satisfied, do one fresh verification pass and then call task_complete unless you observe a concrete contradiction. +""" + + +def _compose_prompt(*sections: str) -> str: + return "\n\n".join(section.strip() for section in sections if section and section.strip()) + + +SYSTEM_PROMPT = _compose_prompt( + "You are ScreenJob, an autonomous Windows desktop-and-terminal task executor.", + CORE_OPERATING_DOCTRINE, + WINDOWS_ENVIRONMENT_RULES, + BROWSER_WORKFLOW_RULES, + DIALOG_HANDLING_RULES, + COMPLETION_VERIFICATION_RULES, +) + + +def _prohibited_key_combo_prompt(prohibited_key_combos: list[str] | tuple[str, ...] 
| set[str] | None = None) -> str: + combos = sorted({str(combo).strip() for combo in (prohibited_key_combos or []) if str(combo).strip()}) + if not combos: + return "" + return ( + "Prohibited key combos for this run: " + + ", ".join(combos) + + ". Do not use them; choose another allowed route.\n" + ) + + +def build_initial_action_prompt( + job: str, + prohibited_key_combos: list[str] | tuple[str, ...] | set[str] | None = None, +) -> str: + return _compose_prompt( + f"JOB: {job}\n" + "You are in an action loop.\n" + "First classify the current UI state from the latest evidence.\n" + "Identify what changed since the last action or screen capture.\n" + "Choose the next action only after confirming the target, likely focus, likely dialog/browser state, and likely result.\n" + "Use classify -> choose control channel -> execute one meaningful transition -> verify.\n" + "Prefer native window/dialog/element tools when they fit the current surface; use screenshots and raw pointer input as fallback.\n" + "Prefer execute_command for deterministic actions.\n" + "Treat command-launched apps or URLs as background until get_active_window, wait_for_focus_change, wait_for_window, or see_screen proves they took focus.\n" + "For modifier shortcuts, use a single press_key combo (example: win+r).\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "Explicitly watch for #32770 dialogs, Explorer open/save pickers, browser download/upload flows, taskbar/start menu focus traps, context menus, disabled controls, and permission prompts.\n" + "If focus or the foreground app may be wrong, call get_active_window, find_window, focus_window, or a focus wait helper.\n" + "If an unexpected modal appears, pause the prior plan and resolve the modal first.\n" + "When a fresh focus check or a clear retained visual already proves the target editor or field is ready, act directly; do not re-capture the screen just to reconfirm an obvious large input area.\n" + "You may use more than one tool 
in one step when that improves certainty, such as get_active_window plus detect_dialog, see_screen then enhance, or click then see_screen.\n" + "When done, do a fresh verification pass with see_screen and add enhance if the proof is small or text-heavy.\n" + "Then call task_complete(return=..., data={\"observed_result\": ...}).\n" + "Include useful structured output in data.", + CORE_OPERATING_DOCTRINE, + WINDOWS_ENVIRONMENT_RULES, + BROWSER_WORKFLOW_RULES, + DIALOG_HANDLING_RULES, + COMPLETION_VERIFICATION_RULES, + ) + + +def build_no_tool_prompt(prohibited_key_combos: list[str] | tuple[str, ...] | set[str] | None = None) -> str: + return _compose_prompt( + "No function call was returned. Recover by re-observing the current desktop state instead of guessing.\n" + "Start by classifying the surface. Use get_active_window, detect_dialog, find_window, see_screen, enhance, get_cursor_position, or clipboard_get according to what is missing.\n" + "Rebuild the live mental model, identify what changed, and choose the next safe action only after confirming the target and likely result.\n" + "Route toward native window/dialog/element tools before repeating raw clicks on Windows surfaces.\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "Do not assume execute_command launches changed the foreground window; verify focus before typing.\n" + "If a modal, picker, or browser download/upload surface is likely, resolve that first.\n" + "Before task_complete, do a fresh verification pass with see_screen, add enhance if needed, and include data.observed_result.", + CORE_OPERATING_DOCTRINE, + WINDOWS_ENVIRONMENT_RULES, + DIALOG_HANDLING_RULES, + COMPLETION_VERIFICATION_RULES, + ) + + +def build_context_compaction_prompt( + objective: str, + step: int, + recent_tool_summaries: list[str], + rebuild_reason: str, + recent_visual_summaries: list[str] | None = None, + prohibited_key_combos: list[str] | tuple[str, ...] 
| set[str] | None = None, +) -> str: + recent = recent_tool_summaries[-8:] + lines = "\n".join(f"- {line}" for line in recent) if recent else "- No recent tool activity." + visual_recent = recent_visual_summaries or [] + visual_lines = "\n".join(f"- {line}" for line in visual_recent) if visual_recent else "- No retained visual observations." + reason_text = { + "decay": "stale context decay", + "visual_budget": "visual-context budget overflow", + }.get(rebuild_reason, rebuild_reason.replace("_", " ")) + return _compose_prompt( + f"Context compaction activated due to {reason_text}.\n" + f"JOB: {objective}\n" + f"Current step: {step}\n" + "Recent tool activity:\n" + f"{lines}\n" + "Retained visual observations:\n" + f"{visual_lines}\n" + "Treat prior reasoning as stale, but do not throw away the retained visuals below.\n" + "The retained visuals already represent the latest image tool calls and their results. Reuse them first; do not call see_screen again only because compaction happened.\n" + "Determine the current surface kind, foreground app, likely focus, pending dialog/browser state, and what changed since the last action before acting.\n" + "Use classify -> choose control channel -> execute one meaningful transition -> verify.\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "Re-observe with see_screen only when the retained visuals are stale or the UI likely changed. Add enhance for small or text-heavy details, and use get_active_window, detect_dialog, wait helpers, clipboard_get, or command stdout when they are the better verification channel.\n" + "If a fresh focus check or retained visual already proves a text field or editor is ready, act without demanding another screenshot.\n" + "Treat execute_command app or URL launches as background until focus is explicitly verified.\n" + "Use tools only. 
Finish only after a fresh verification pass with see_screen and explicit data.observed_result in task_complete.", + CORE_OPERATING_DOCTRINE, + WINDOWS_ENVIRONMENT_RULES, + BROWSER_WORKFLOW_RULES, + DIALOG_HANDLING_RULES, + COMPLETION_VERIFICATION_RULES, + ) + + +def build_blocked_action_prompt( + tool_name: str, + hint: str | None = None, + prohibited_key_combos: list[str] | tuple[str, ...] | set[str] | None = None, +) -> str: + extra = f"\n{hint.strip()}" if hint and hint.strip() else "" + return _compose_prompt( + f"The last action using {tool_name} was blocked or unreliable. Do not retry blindly.\n" + "Re-anchor on the live desktop state first: classify the current surface, then choose the best tool family.\n" + "If this looks like a dialog, picker, permission prompt, or browser-owned confirmation surface, route to detect_dialog, dialog_action, dialog_set_filename, wait_for_dialog_close, list_ui_elements, or focus/window tools before pixel retries.\n" + "If focus or the foreground app may be wrong, call get_active_window, find_window, or focus_window.\n" + "If pointer placement matters, call get_cursor_position before move_mouse or drag.\n" + "If copy success matters, call clipboard_get instead of assuming it worked.\n" + "If execute_command launched an app or URL, do not assume it is foreground until get_active_window, wait_for_focus_change, wait_for_window, or see_screen confirms it.\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "If secure desktop or UAC is suspected, stop blind retries and report the blocked state explicitly.\n" + "Switch strategy after the fresh classification: native control instead of pixels, keyboard instead of mouse, mouse instead of keyboard, commands instead of UI, UI instead of commands, or finish if the job is already done.\n" + f"Confirm what changed before choosing a new action. 
Use classify -> choose control channel -> execute one meaningful transition -> verify.{extra}", + CORE_OPERATING_DOCTRINE, + WINDOWS_ENVIRONMENT_RULES, + BROWSER_WORKFLOW_RULES, + DIALOG_HANDLING_RULES, + COMPLETION_VERIFICATION_RULES, + ) + + +def build_observation_loop_prompt( + window_summary: str | None = None, + repeated_steps: int = 3, + prohibited_key_combos: list[str] | tuple[str, ...] | set[str] | None = None, +) -> str: + summary_text = f" Current foreground window: {window_summary}." if window_summary else "" + return _compose_prompt( + f"You have already re-observed the same stable window for {repeated_steps} step(s) without making progress." + f"{summary_text}\n" + "Do not keep calling broad observation tools like see_screen or get_active_window on the same unchanged state.\n" + "Change method now: use a native window/dialog/element tool for this surface, interact with the visible control, resolve the modal, or finish if you already have proof.\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "Use enhance only if a small or text-heavy control must be read before acting.\n" + "If this window is a #32770 dialog or picker, act on the dialog instead of re-scanning the whole screen.\n" + "Use classify -> choose control channel -> execute one meaningful transition -> verify.", + CORE_OPERATING_DOCTRINE, + WINDOWS_ENVIRONMENT_RULES, + DIALOG_HANDLING_RULES, + COMPLETION_VERIFICATION_RULES, + ) + + +def build_finish_likely_prompt( + evidence_summary: str, + *, + verification_done: bool = False, + prohibited_key_combos: list[str] | tuple[str, ...] 
| set[str] | None = None, +) -> str: + if verification_done: + return _compose_prompt( + "Runtime completion evidence still indicates the objective is satisfied.\n" + f"Evidence: {evidence_summary}\n" + "You already have a fresh post-completion verification pass.\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "Do not reopen menus, repeat save/export/download actions, or re-search the filesystem unless a new contradiction appears.\n" + "Call task_complete now with a concise return string and explicit data.observed_result from the latest verification.", + COMPLETION_VERIFICATION_RULES, + ) + return _compose_prompt( + "Runtime completion evidence indicates the objective is likely already satisfied.\n" + f"Evidence: {evidence_summary}\n" + "Do one fresh verification pass now: call see_screen, add enhance only if the proof is small or text-heavy, then call task_complete.\n" + f"{_prohibited_key_combo_prompt(prohibited_key_combos)}" + "Do not reopen menus, repeat save/export/download actions, or re-search the filesystem unless a new contradiction appears.", + COMPLETION_VERIFICATION_RULES, + ) + + +ALL_TOOL_NAMES: tuple[str, ...] 
= ( + "task_complete", + "execute_command", + "sleep", + "see_screen", + "enhance", + "list_windows", + "find_window", + "focus_window", + "close_window", + "wait_for_window", + "wait_for_focus_change", + "detect_dialog", + "dialog_action", + "dialog_set_filename", + "wait_for_dialog_close", + "list_ui_elements", + "invoke_ui_element", + "set_ui_element_value", + "select_ui_element", + "wait_for_ui_element", + "type", + "press_key", + "click", + "scroll", + "drag", + "move_mouse", + "clipboard_get", + "clipboard_set", + "get_cursor_position", + "get_active_window", +) +PROTECTED_TOOL_NAMES = {"task_complete"} +VISUAL_TOOL_NAMES = {"see_screen", "enhance"} +WINDOW_TOOL_NAMES = { + "list_windows", + "find_window", + "focus_window", + "close_window", + "wait_for_window", + "wait_for_focus_change", + "get_active_window", +} +DIALOG_TOOL_NAMES = { + "detect_dialog", + "dialog_action", + "dialog_set_filename", + "wait_for_dialog_close", +} +UI_ELEMENT_TOOL_NAMES = { + "list_ui_elements", + "invoke_ui_element", + "set_ui_element_value", + "select_ui_element", + "wait_for_ui_element", +} +OBSERVATION_TOOL_NAMES = VISUAL_TOOL_NAMES | WINDOW_TOOL_NAMES | DIALOG_TOOL_NAMES | UI_ELEMENT_TOOL_NAMES | { + "clipboard_get", + "get_cursor_position", +} +MAX_ACTION_SIGNATURE_ATTEMPTS = 3 +MAX_STABLE_OBSERVATION_STEPS = 3 +FINISH_LIKELY_OBSERVATION_TOOLS = {"see_screen", "enhance", "get_active_window", "detect_dialog"} + + +def normalize_disabled_tools(tool_names: set[str] | list[str] | tuple[str, ...] | None) -> list[str]: + normalized = sorted({str(tool).strip().lower() for tool in (tool_names or []) if str(tool).strip()}) + invalid = [name for name in normalized if name not in ALL_TOOL_NAMES] + if invalid: + invalid_text = ", ".join(invalid) + allowed_text = ", ".join(ALL_TOOL_NAMES) + raise ValueError(f"Unknown disabled tool(s): {invalid_text}. 
Allowed tools: {allowed_text}.") + blocked = [name for name in normalized if name in PROTECTED_TOOL_NAMES] + if blocked: + blocked_text = ", ".join(blocked) + raise ValueError(f"Cannot disable required tool(s): {blocked_text}.") + return normalized + class ScreenJobAgent: def __init__( @@ -81,10 +382,38 @@ class ScreenJobAgent: self.last_screen_data_url: str | None = None self.last_screen_meta: dict[str, Any] | None = None + self.last_visual_signature: str | None = None self.click_history: list[tuple[int, int, float]] = [] - self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()} + self.disabled_tools = set(normalize_disabled_tools(options.disable_tools)) self.recent_tool_summaries: list[str] = [] self.last_context_compact_step = 0 + self.visual_context_messages: list[dict[str, Any]] = [] + self.visual_context_overflow_pending = False + self.visual_context_sequence = 0 + self.action_gate_state: dict[str, dict[str, Any]] = {} + self.step_history: list[dict[str, Any]] = [] + self.prohibited_key_combos = self._normalize_prohibited_key_combos(options.prohibited_key_combos) + self.last_observed_window: dict[str, Any] | None = None + self.last_surface_state: dict[str, Any] = { + "surface_kind": "unknown", + "dialog_kind": "none", + "active_app": "", + "browser_workflow_state": "", + "target": None, + "confidence": 0.0, + } + self.surface_retry_state: dict[str, dict[str, Any]] = {} + self.last_native_targets: dict[str, dict[str, Any]] = {} + self.completion_evidence: dict[str, dict[str, Any]] = {} + self.finish_likely_state: dict[str, Any] = { + "active": False, + "activated_at_step": 0, + "target_filename": "", + "summary": "", + "fresh_verification_done": False, + "verification_step": 0, + "post_completion_visual_signature": "", + } def _emit(self, event_type: str, payload: dict[str, Any]) -> None: if self.event_callback is None: @@ -135,11 +464,18 @@ class ScreenJobAgent: ) def _tool_schemas(self) -> list[dict[str, Any]]: + 
prohibited_combo_text = "" + if self.prohibited_key_combos: + prohibited_combo_text = " Prohibited for this run: " + ", ".join(self._sorted_prohibited_key_combos()) + "." all_tools: list[dict[str, Any]] = [ { "type": "function", "name": "task_complete", - "description": "Call this when the job objective is fully done.", + "description": ( + "Call this only when the job objective is fully done and freshly verified. " + "Before finishing, call see_screen and add enhance if the proof is small or text-heavy. " + "Include explicit verification text such as data.observed_result." + ), "parameters": { "type": "object", "properties": { @@ -156,7 +492,9 @@ class ScreenJobAgent: "name": "execute_command", "description": ( "Run a shell command and return stdout/stderr/exit code. " - "Prefer this for deterministic operations like opening URLs." + "Prefer this for deterministic operations like opening URLs. " + "Do not assume a launched app or URL took foreground focus until get_active_window or see_screen confirms it. " + "Do not use recursive file-search or reveal commands unless the objective explicitly asks to locate an output." ), "parameters": { "type": "object", @@ -186,7 +524,10 @@ class ScreenJobAgent: { "type": "function", "name": "see_screen", - "description": "Capture full screen with coordinate grid overlay.", + "description": ( + "Capture full screen with coordinate grid overlay. " + "Use before a new UI phase, after state changes, when uncertain, and before task_complete." + ), "parameters": { "type": "object", "properties": {}, @@ -198,7 +539,7 @@ class ScreenJobAgent: "name": "enhance", "description": ( "Create enhanced zoom around a coordinate for readability and precise targeting. " - "Prefer this before clicking tiny or ambiguous UI targets." + "Prefer this for tiny, dense, visually similar, or text-heavy targets." 
), "parameters": { "type": "object", @@ -229,6 +570,246 @@ class ScreenJobAgent: "additionalProperties": False, }, }, + { + "type": "function", + "name": "list_windows", + "description": ( + "List visible top-level windows with Windows-native metadata including executable name, dialog classification, " + "and focus-relevant state. Prefer this over pixel guesses when deciding which app or dialog is present." + ), + "parameters": { + "type": "object", + "properties": { + "visible_only": {"type": "boolean"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "find_window", + "description": ( + "Find one top-level window by handle, title substring, class name, or executable name. " + "Use before focus changes or dialog routing." + ), + "parameters": { + "type": "object", + "properties": { + "hwnd": {"type": ["integer", "string"]}, + "title_contains": {"type": "string"}, + "class_name": {"type": "string"}, + "executable_name": {"type": "string"}, + "visible_only": {"type": "boolean"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "focus_window", + "description": ( + "Bring a target window to the foreground using native focus APIs. " + "Prefer this over raw Alt+Tab loops when the target window is known." 
+ ), + "parameters": { + "type": "object", + "properties": { + "hwnd": {"type": ["integer", "string"]}, + "title_contains": {"type": "string"}, + "class_name": {"type": "string"}, + "executable_name": {"type": "string"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "close_window", + "description": "Close a target top-level window with WM_CLOSE when the objective requires dismissing it.", + "parameters": { + "type": "object", + "properties": { + "hwnd": {"type": ["integer", "string"]}, + "title_contains": {"type": "string"}, + "class_name": {"type": "string"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "wait_for_window", + "description": "Wait for a top-level window matching title/class/executable criteria to appear.", + "parameters": { + "type": "object", + "properties": { + "title_contains": {"type": "string"}, + "class_name": {"type": "string"}, + "executable_name": {"type": "string"}, + "timeout_seconds": {"type": ["number", "string"]}, + "visible_only": {"type": "boolean"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "wait_for_focus_change", + "description": ( + "Wait for foreground focus to change, optionally toward a matching target window. " + "Use this after launches, tab switches, or focus transfers." + ), + "parameters": { + "type": "object", + "properties": { + "title_contains": {"type": "string"}, + "class_name": {"type": "string"}, + "executable_name": {"type": "string"}, + "timeout_seconds": {"type": ["number", "string"]}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "detect_dialog", + "description": ( + "Detect the active or visible dialog and classify it as save/open/confirmation/permission/modal. " + "Prefer this when a #32770 or picker surface is likely." 
+ ), + "parameters": { + "type": "object", + "properties": { + "title_contains": {"type": "string"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "dialog_action", + "description": ( + "Invoke a common dialog button such as Open, Save, Cancel, OK, Yes, No, or Close through native controls." + ), + "parameters": { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["open", "save", "cancel", "ok", "yes", "no", "close"], + }, + "hwnd": {"type": ["integer", "string"]}, + }, + "required": ["action"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "dialog_set_filename", + "description": "Set the filename or path field in a native open/save dialog without coordinate clicking.", + "parameters": { + "type": "object", + "properties": { + "filename": {"type": "string"}, + "hwnd": {"type": ["integer", "string"]}, + }, + "required": ["filename"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "wait_for_dialog_close", + "description": "Wait until the current or specified dialog is no longer visible.", + "parameters": { + "type": "object", + "properties": { + "hwnd": {"type": ["integer", "string"]}, + "timeout_seconds": {"type": ["number", "string"]}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "list_ui_elements", + "description": ( + "List visible native child controls for the active window or a specific target window. " + "Use this for precise dialog or window interaction when controls are exposed." 
+ ), + "parameters": { + "type": "object", + "properties": { + "window": {"type": "object"}, + "scope": {"type": "string", "enum": ["active_window", "dialog"]}, + "include_hidden": {"type": "boolean"}, + "text_contains": {"type": "string"}, + "class_name": {"type": "string"}, + "role": {"type": "string"}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "invoke_ui_element", + "description": "Invoke a native button or exposed control by returned target handle.", + "parameters": { + "type": "object", + "properties": { + "element": {"type": "object"}, + }, + "required": ["element"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "set_ui_element_value", + "description": "Set text on an exposed native edit control by returned target handle.", + "parameters": { + "type": "object", + "properties": { + "element": {"type": "object"}, + "text": {"type": "string"}, + }, + "required": ["element", "text"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "select_ui_element", + "description": "Select a native combo-box or list-box item by text or index when supported.", + "parameters": { + "type": "object", + "properties": { + "element": {"type": "object"}, + "text": {"type": "string"}, + "index": {"type": ["integer", "string"]}, + }, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "wait_for_ui_element", + "description": "Wait until a native child control matching filters appears in the chosen window or dialog.", + "parameters": { + "type": "object", + "properties": { + "window": {"type": "object"}, + "scope": {"type": "string", "enum": ["active_window", "dialog"]}, + "text_contains": {"type": "string"}, + "class_name": {"type": "string"}, + "role": {"type": "string"}, + "include_hidden": {"type": "boolean"}, + "timeout_seconds": {"type": ["number", "string"]}, + }, + "additionalProperties": False, + }, + }, { "type": "function", "name": "type", @@ 
-246,6 +827,7 @@ class ScreenJobAgent: "description": ( "Press a key or key combo. " "For combos use plus syntax in one call (examples: 'win+r', 'ctrl+shift+esc')." + + prohibited_combo_text ), "parameters": { "type": "object", @@ -261,7 +843,8 @@ class ScreenJobAgent: "type": "function", "name": "click", "description": ( - "Click absolute screen coordinate with simple directional offsets. " + "Click absolute screen coordinate with optional button selection, multi-click count, " + "and simple directional offsets. " "Use offset_up/down/left/right values like 2 or '2px'. " "Optional sleep_after_seconds performs a pause immediately after click." ), @@ -290,15 +873,168 @@ class ScreenJobAgent: "offset_down": {"type": ["integer", "string"]}, "offset_left": {"type": ["integer", "string"]}, "offset_right": {"type": ["integer", "string"]}, + "button": {"type": "string", "enum": ["left", "middle", "right"]}, + "click_count": {"type": "integer", "minimum": 1, "maximum": 5}, + "interval_seconds": {"type": ["number", "string"]}, + "duration_seconds": {"type": ["number", "string"]}, "sleep_after_seconds": {"type": ["number", "string"]}, }, "required": ["coordinate"], "additionalProperties": False, }, }, + { + "type": "function", + "name": "scroll", + "description": ( + "Scroll vertically. Positive amount scrolls up and negative amount scrolls down. " + "Optional direction can force up/down semantics regardless of sign." 
+ ), + "parameters": { + "type": "object", + "properties": { + "amount": {"type": ["integer", "string"]}, + "direction": {"type": "string", "enum": ["up", "down"]}, + "coordinate": { + "type": "object", + "properties": { + "x": {"type": "integer"}, + "y": {"type": "integer"}, + }, + "required": ["x", "y"], + "additionalProperties": False, + }, + "sleep_after_seconds": {"type": ["number", "string"]}, + }, + "required": ["amount"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "drag", + "description": ( + "Drag the mouse from one absolute screen coordinate to another. " + "Confirm the intended control region with see_screen or enhance before dragging." + ), + "parameters": { + "type": "object", + "properties": { + "start_coordinate": { + "type": "object", + "properties": { + "x": {"type": "integer"}, + "y": {"type": "integer"}, + }, + "required": ["x", "y"], + "additionalProperties": False, + }, + "end_coordinate": { + "type": "object", + "properties": { + "x": {"type": "integer"}, + "y": {"type": "integer"}, + }, + "required": ["x", "y"], + "additionalProperties": False, + }, + "button": {"type": "string", "enum": ["left", "middle", "right"]}, + "duration_seconds": {"type": ["number", "string"]}, + "sleep_after_seconds": {"type": ["number", "string"]}, + }, + "required": ["start_coordinate", "end_coordinate"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "move_mouse", + "description": ( + "Move the mouse pointer to an absolute screen coordinate without clicking. " + "Use after confirming the target region on screen." 
+ ), + "parameters": { + "type": "object", + "properties": { + "coordinate": { + "type": "object", + "properties": { + "x": {"type": "integer"}, + "y": {"type": "integer"}, + }, + "required": ["x", "y"], + "additionalProperties": False, + }, + "duration_seconds": {"type": ["number", "string"]}, + }, + "required": ["coordinate"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "clipboard_get", + "description": ( + "Read clipboard state to verify copy or cut results. " + "Returns text plus lightweight format metadata without raw image bytes." + ), + "parameters": { + "type": "object", + "properties": {}, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "clipboard_set", + "description": "Write text to the Windows clipboard.", + "parameters": { + "type": "object", + "properties": { + "text": {"type": "string"}, + }, + "required": ["text"], + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "get_cursor_position", + "description": "Return the current absolute cursor position when pointer state matters.", + "parameters": { + "type": "object", + "properties": {}, + "additionalProperties": False, + }, + }, + { + "type": "function", + "name": "get_active_window", + "description": ( + "Return metadata for the current foreground window to verify focus, active app, surface kind, " + "browser state, and dialog classification." 
def _compute_visual_signature(self, image: Image.Image) -> str:
    """Return a short fingerprint of *image* for cheap change detection.

    The image is reduced to a 64x64 grayscale thumbnail and hashed; the first
    16 hex digits of the SHA-1 digest are returned. The hash is used purely
    as a fingerprint here, not as a security primitive.
    """
    thumbnail = image.convert("L").resize((64, 64))
    digest = hashlib.sha1(thumbnail.tobytes())
    return digest.hexdigest()[:16]


def _max_visual_context_images(self) -> int:
    """Return the configured visual-context image budget, never below zero."""
    configured = int(self.options.max_visual_context_images or 0)
    return configured if configured > 0 else 0


def _is_visual_input_message(self, item: dict[str, Any]) -> bool:
    """Tell whether *item* is a user message with at least one ``input_image`` part."""
    if item.get("role") != "user":
        return False
    parts = item.get("content")
    if not isinstance(parts, list):
        return False
    for part in parts:
        # Non-dict parts are ignored rather than treated as errors.
        if isinstance(part, dict) and part.get("type") == "input_image":
            return True
    return False


def _sorted_prohibited_key_combos(self) -> list[str]:
    """Return the prohibited key combos as a sorted list."""
    combos = list(self.prohibited_key_combos)
    combos.sort()
    return combos


def _native_automation_mode(self) -> str:
    """Return the normalized native automation mode.

    Missing or unrecognized option values fall back to ``"prefer"``.
    """
    configured = self.options.native_automation_mode or "prefer"
    normalized = str(configured).strip().lower()
    if normalized in {"off", "prefer", "require_fallback"}:
        return normalized
    return "prefer"


def _native_control_tools_enabled(self) -> bool:
    """Return True unless the native automation mode is ``"off"``."""
    mode = self._native_automation_mode()
    return not (mode == "off")


def _active_app_identity(self, window: dict[str, Any] | None) -> str:
    """Pick the best available label for the application owning *window*.

    Preference order: executable name, then title, then window class. Returns
    an empty string when *window* is not a dict or all three are blank.
    """
    if not isinstance(window, dict):
        return ""
    for field in ("executable_name", "title"):
        candidate = str(window.get(field) or "").strip()
        if candidate:
            return candidate
    return str(window.get("class_name") or "").strip()
def _looks_like_browser_window(self, window: dict[str, Any] | None) -> bool:
    """Heuristically decide whether *window* belongs to a web browser.

    A window qualifies when its executable name, window class, or title
    contains one of the known markers (case-insensitive substring match).
    """
    if not isinstance(window, dict):
        return False

    def _field(key: str) -> str:
        return str(window.get(key) or "").strip().lower()

    exe = _field("executable_name")
    if any(token in exe for token in ("chrome", "msedge", "firefox", "iexplore", "opera", "brave")):
        return True
    # NOTE(review): "cabinetwclass" looks like the File Explorer window class;
    # confirm it is really meant to count as a browser surface.
    cls = _field("class_name")
    if any(token in cls for token in ("chrome_widgetwin", "mozillawindowclass", "cabinetwclass")):
        return True
    title = _field("title")
    title_markers = (" - google chrome", " - microsoft edge", " - mozilla firefox", " - brave")
    return any(marker in title for marker in title_markers)


def _browser_workflow_state(self, window: dict[str, Any] | None) -> str:
    """Label the workflow state of a browser window.

    Returns an empty string for non-browser windows; otherwise one of
    ``"browser_dialog_or_download"``, ``"browser_chrome"`` or
    ``"browser_window"``, checked in that order.
    """
    if not self._looks_like_browser_window(window):
        return ""
    info = window or {}
    title = str(info.get("title") or "").strip().lower()
    class_name = str(info.get("class_name") or "").strip().lower()
    # Title markers win over the window class; matching is by substring.
    for token in ("save as", "open", "downloads", "download"):
        if token in title:
            return "browser_dialog_or_download"
    for engine_class in ("chrome_widgetwin", "mozillawindowclass"):
        if engine_class in class_name:
            return "browser_chrome"
    return "browser_window"


def _dialog_kind_from_window(self, window: dict[str, Any] | None) -> str:
    """Classify *window* as a dialog kind based on its class and title.

    Returns ``"none"`` for unavailable windows, ``"secure_desktop"`` when the
    window is flagged as such, and for ``#32770`` windows the first matching
    kind among file_save, file_open, confirmation and permission (falling
    back to ``"modal"``). Other window classes are only ever reported as
    ``"permission"`` or ``"none"``.
    """
    if not isinstance(window, dict) or not bool(window.get("available")):
        return "none"
    if bool(window.get("secure_desktop")):
        return "secure_desktop"
    title = str(window.get("title") or "").strip().lower()
    # The class comparison is exact and case-sensitive, unlike the title checks.
    if str(window.get("class_name") or "").strip() != "#32770":
        non_dialog_tokens = ("user account control", "permission")
        if any(token in title for token in non_dialog_tokens):
            return "permission"
        return "none"
    # Rule order matters: the first kind whose token appears in the title wins.
    ordered_rules = (
        ("file_save", ("save", "save as", "save file")),
        ("file_open", ("open", "choose file", "select file", "upload")),
        ("confirmation", ("confirm", "replace", "delete", "warning", "question", "are you sure")),
        ("permission", ("permission", "administrator", "user account control")),
    )
    for kind, tokens in ordered_rules:
        if any(token in title for token in tokens):
            return kind
    return "modal"
def _surface_kind_from_window(self, window: dict[str, Any] | None) -> str:
    """Map *window* to a coarse surface kind.

    Dialog classification takes precedence, then browser detection, then a
    title/class check for Start-menu style system surfaces. Unavailable
    windows yield ``"unknown"``; everything else is ``"application_window"``.
    """
    if not isinstance(window, dict) or not bool(window.get("available")):
        return "unknown"
    dialog_kind = self._dialog_kind_from_window(window)
    surface_for_dialog = {
        "secure_desktop": "secure_desktop",
        "file_open": "file_dialog",
        "file_save": "file_dialog",
        "confirmation": "modal_dialog",
        "permission": "modal_dialog",
        "modal": "modal_dialog",
    }
    mapped = surface_for_dialog.get(dialog_kind)
    if mapped is not None:
        return mapped
    if self._looks_like_browser_window(window):
        return "browser_window"
    title = str(window.get("title") or "").strip().lower()
    class_name = str(window.get("class_name") or "").strip().lower()
    mentions_start = "start" in title
    if mentions_start and "menu" in title:
        return "system_surface"
    if mentions_start and class_name in {"shell_traywnd", "button"}:
        return "system_surface"
    return "application_window"


def _recommended_tools_for_surface(self, surface_kind: str, dialog_kind: str = "none") -> list[str]:
    """Return the tool names suggested for the given surface classification.

    The surface kind decides first; for any other surface an active dialog
    kind selects the dialog tools, and the generic observation tools are the
    final fallback. A fresh list is returned on every call.
    """
    per_surface = {
        "secure_desktop": ["task_complete"],
        "file_dialog": ["dialog_set_filename", "dialog_action", "wait_for_dialog_close", "list_ui_elements"],
        "modal_dialog": ["detect_dialog", "dialog_action", "list_ui_elements"],
        "browser_window": ["get_active_window", "detect_dialog", "see_screen", "enhance"],
        "system_surface": ["get_active_window", "see_screen", "enhance"],
    }
    if surface_kind in per_surface:
        return per_surface[surface_kind]
    if dialog_kind != "none":
        return ["detect_dialog", "dialog_action"]
    return ["get_active_window", "see_screen", "enhance"]


def _build_target_handle(
    self,
    handle: int,
    *,
    target_type: str,
    window_handle: int | None = None,
    parent_handle: int | None = None,
    control_id: int | None = None,
    class_name: str = "",
    text: str = "",
) -> dict[str, Any]:
    """Build the dictionary that identifies a native target.

    ``type`` and ``handle`` are always present; the remaining keys are added
    only when the corresponding argument is not ``None`` (integers) or not
    empty (strings). Integer fields are coerced with ``int()``.
    """
    descriptor: dict[str, Any] = {"type": target_type, "handle": int(handle)}
    optional_ints = (
        ("window_handle", window_handle),
        ("parent_handle", parent_handle),
        ("control_id", control_id),
    )
    for key, value in optional_ints:
        if value is not None:
            descriptor[key] = int(value)
    for key, value in (("class_name", class_name), ("text", text)):
        if value:
            descriptor[key] = value
    return descriptor
def _update_surface_state(
    self,
    *,
    window: dict[str, Any] | None = None,
    target: dict[str, Any] | None = None,
    confidence: float = 0.0,
    source_tool: str = "",
) -> dict[str, Any]:
    """Re-classify the current surface, remember it, and emit change events.

    The new state replaces ``self.last_surface_state`` on every call. Events
    are emitted only when the JSON form of the state differs from the
    previous one: ``surface_classified`` always, plus ``dialog_detected``
    when the classified dialog kind is not ``"none"``.

    Returns the freshly built state dictionary.
    """
    app_identity = self._active_app_identity(window)
    surface = self._surface_kind_from_window(window)
    dialog = self._dialog_kind_from_window(window)
    browser_state = self._browser_workflow_state(window)
    # Copy the target so later caller-side mutation cannot alter stored state.
    target_copy = dict(target) if isinstance(target, dict) else None
    state = {
        "surface_kind": surface,
        "dialog_kind": dialog,
        "active_app": app_identity,
        "browser_workflow_state": browser_state,
        "target": target_copy,
        "confidence": round(clamp(confidence, 0.0, 1.0), 3),
    }

    def _fingerprint(value: Any) -> str:
        return json.dumps(value, sort_keys=True, ensure_ascii=False)

    changed = _fingerprint(self.last_surface_state) != _fingerprint(state)
    self.last_surface_state = state
    if not changed:
        return state
    self._emit("surface_classified", {"source_tool": source_tool, **state})
    if dialog != "none":
        self._emit(
            "dialog_detected",
            {
                "source_tool": source_tool,
                "dialog_kind": dialog,
                "surface_kind": surface,
                "active_app": app_identity,
                "target": target_copy,
            },
        )
    return state


def _build_native_result(
    self,
    *,
    ok: bool,
    message: str,
    window: dict[str, Any] | None = None,
    target: dict[str, Any] | None = None,
    confidence: float = 0.0,
    blocking_reason: str | None = None,
    recommended_next_tools: list[str] | None = None,
    native_channel: str = "window",
) -> dict[str, Any]:
    """Assemble the result payload returned by a native-automation tool.

    The surface state is refreshed first (with *native_channel* as the source
    tool) and its classification is copied into the result. When no explicit
    *recommended_next_tools* are given, suggestions are derived from the
    classified surface. ``target`` and ``blocking_reason`` are included only
    when provided. A ``blocked_by_secure_desktop`` event is emitted when the
    classification is a secure desktop.
    """
    state = self._update_surface_state(
        window=window,
        target=target,
        confidence=confidence,
        source_tool=native_channel,
    )
    surface_kind = str(state["surface_kind"])
    dialog_kind = str(state["dialog_kind"])
    next_tools = recommended_next_tools
    if not next_tools:
        next_tools = self._recommended_tools_for_surface(surface_kind, dialog_kind)
    payload: dict[str, Any] = {
        "ok": ok,
        "message": message,
        "surface_kind": state["surface_kind"],
        "dialog_kind": state["dialog_kind"],
        "active_app": state["active_app"],
        "browser_workflow_state": state["browser_workflow_state"],
        "confidence": state["confidence"],
        "recommended_next_tools": next_tools,
        "native_automation_mode": self._native_automation_mode(),
        "native_channel": native_channel,
    }
    if target is not None:
        payload["target"] = target
    if blocking_reason:
        payload["blocking_reason"] = blocking_reason
    if "secure_desktop" in (surface_kind, dialog_kind):
        self._emit(
            "blocked_by_secure_desktop",
            {
                "message": message,
                "native_channel": native_channel,
                "target": target,
            },
        )
    return payload
def _native_control_unavailable_result(
    self,
    *,
    message: str,
    blocking_reason: str = "native_automation_disabled",
) -> dict[str, Any]:
    """Build the blocked result used when native control cannot be used.

    Emits a ``fallback_to_pixels`` event and returns a failure payload that
    carries the last known surface and dialog kinds (defaulting to
    ``"unknown"`` / ``"none"``) and recommends the observation tools.
    """
    last_state = self.last_surface_state
    surface_kind = str(last_state.get("surface_kind") or "unknown")
    dialog_kind = str(last_state.get("dialog_kind") or "none")
    self._emit(
        "fallback_to_pixels",
        {
            "reason": blocking_reason,
            "message": message,
            "surface_kind": surface_kind,
            "dialog_kind": dialog_kind,
        },
    )
    payload: dict[str, Any] = {
        "ok": False,
        "blocked": True,
        "error": message,
        "blocking_reason": blocking_reason,
        "surface_kind": surface_kind,
        "dialog_kind": dialog_kind,
        "recommended_next_tools": ["get_active_window", "see_screen", "enhance"],
    }
    payload["native_automation_mode"] = self._native_automation_mode()
    return payload
def _store_native_target(self, key: str, target: dict[str, Any] | None) -> None:
    """Remember a copy of *target* under *key*; ignore empty keys and non-dict targets."""
    if key and isinstance(target, dict):
        self.last_native_targets[key] = dict(target)


def _match_text_filter(self, value: Any, expected: str) -> bool:
    """Case-insensitive substring filter; an empty *expected* matches anything."""
    needle = str(expected or "").strip().lower()
    if not needle:
        return True
    haystack = str(value or "").strip().lower()
    return needle in haystack


def _resolve_target_handle(self, payload: Any) -> int:
    """Extract an integer handle from a target dict or a bare handle value (0 on failure)."""
    raw_handle = payload.get("handle") if isinstance(payload, dict) else payload
    return self._parse_int(raw_handle, default=0)


def _describe_visual_context_entry(
    self,
    tool_name: str,
    meta: dict[str, Any],
    result: dict[str, Any] | None = None,
) -> str:
    """Summarize one visual-context entry as a single space-separated line.

    ``enhance`` entries report the source coordinate, region, mode and (when
    positive) scale; any other tool is described as a ``see_screen`` capture
    with its size when known. Capture time, file path and a truncated error
    text (first 120 characters) are appended when available.
    """
    if tool_name == "enhance":
        origin = meta.get("source_coord") if isinstance(meta.get("source_coord"), dict) else {}
        at_x = self._parse_int(origin.get("x"), default=0)
        at_y = self._parse_int(origin.get("y"), default=0)
        region = str(meta.get("region") or "small").strip()
        mode = str(meta.get("mode") or "ui").strip()
        zoom = self._parse_int(meta.get("scale"), default=0)
        pieces = [f"enhance ok at=({at_x},{at_y})", f"region={region}", f"mode={mode}"]
        if zoom > 0:
            pieces.append(f"scale={zoom}")
    else:
        width = self._parse_int(meta.get("width"), default=0)
        height = self._parse_int(meta.get("height"), default=0)
        pieces = ["see_screen ok"]
        if width > 0 and height > 0:
            pieces.append(f"size={width}x{height}")
    captured_at = str(meta.get("captured_at") or "").strip()
    if captured_at:
        pieces.append(f"captured_at={captured_at}")
    path = str(meta.get("path") or "").strip()
    if path:
        pieces.append(f"path={path}")
    failed = bool(result) and not bool(result.get("ok"))
    if failed:
        error_text = str(result.get("error") or "").strip()
        if error_text:
            pieces.append(f"error={error_text[:120]}")
    return " ".join(pieces)
def _register_visual_context_message(
    self,
    message: dict[str, Any],
    meta: dict[str, Any],
    *,
    tool_name: str,
    result: dict[str, Any] | None = None,
) -> None:
    """Track a new visual-context message and enforce the image budget.

    The entry is appended with the next sequence number. With a non-positive
    budget every tracked entry is dropped; otherwise only the newest entries
    within the budget are kept. ``visual_context_overflow_pending`` is set
    whenever entries were dropped.
    """
    self.visual_context_sequence += 1
    self.visual_context_messages.append(
        {
            "message": message,
            "meta": dict(meta),
            "tool_name": tool_name,
            "summary": self._describe_visual_context_entry(tool_name, meta, result),
            "sequence": self.visual_context_sequence,
        }
    )
    budget = self._max_visual_context_images()
    if budget <= 0:
        self.visual_context_overflow_pending = bool(self.visual_context_messages)
        self.visual_context_messages = []
        return
    if len(self.visual_context_messages) > budget:
        self.visual_context_overflow_pending = True
        self.visual_context_messages = self._latest_visual_context_entries(
            self.visual_context_messages,
            budget,
        )


def _parse_visual_context_timestamp(self, value: Any) -> float:
    """Convert an ISO-8601 timestamp to epoch seconds; return 0.0 when blank or unparsable.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing.
    """
    text = str(value or "").strip()
    if not text:
        return 0.0
    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00")).timestamp()
    except Exception:  # noqa: BLE001
        # Deliberately best-effort: an unparsable timestamp just sorts first.
        return 0.0


def _visual_context_sort_key(self, entry: dict[str, Any]) -> tuple[float, int, int]:
    """Return the ordering key ``(captured_at, file mtime in ns, sequence)`` for *entry*.

    Each component falls back to 0 when it is missing or cannot be read, so
    later components act as tie-breakers.
    """
    meta = entry.get("meta") if isinstance(entry.get("meta"), dict) else {}
    captured_at = self._parse_visual_context_timestamp(meta.get("captured_at"))
    path = str(meta.get("path") or "").strip()
    file_mtime_ns = 0
    if path:
        try:
            file_mtime_ns = int(os.stat(path).st_mtime_ns)
        except Exception:  # noqa: BLE001
            # A missing or unreadable file must not break ordering.
            file_mtime_ns = 0
    sequence = self._parse_int(entry.get("sequence"), default=0)
    return (captured_at, file_mtime_ns, sequence)


def _latest_visual_context_entries(
    self,
    entries: list[dict[str, Any]],
    budget: int | None = None,
) -> list[dict[str, Any]]:
    """Return the newest *budget* entries in ascending (oldest-first) order.

    When *budget* is ``None`` the configured image budget is used. A
    non-positive limit or an empty input yields an empty list.
    """
    limit = self._max_visual_context_images() if budget is None else max(0, int(budget))
    if limit <= 0 or not entries:
        return []
    ordered = sorted(entries, key=self._visual_context_sort_key)
    # The tail of a sorted list is already sorted, so no second sort is needed;
    # re-sorting would call the key (and its os.stat) again for every entry.
    return ordered[-limit:]
def _latest_visual_context_summaries(self) -> list[str]:
    """Return the non-empty summaries of the retained visual-context entries, in order."""
    retained = self._latest_visual_context_entries(self.visual_context_messages)
    cleaned = (str(entry.get("summary") or "").strip() for entry in retained)
    return [summary for summary in cleaned if summary]


def _infer_target_filename(self, text: str) -> str:
    """Guess the filename a task text refers to, lowercased ('' when none is found).

    Patterns are tried in priority order: a name introduced by
    "as"/"named"/"called", then a quoted name, then any bare
    ``name.ext`` token. Within the first pattern that matches, the last
    occurrence in the text wins.
    """
    haystack = str(text or "")
    patterns_by_priority = (
        r'(?i)\b(?:as|named|called)\s+["\']?([A-Za-z0-9][A-Za-z0-9._ -]{0,120}\.[A-Za-z0-9]{1,8})["\']?',
        r'(?i)["\']([A-Za-z0-9][A-Za-z0-9._ -]{0,120}\.[A-Za-z0-9]{1,8})["\']',
        r"(?i)\b([A-Za-z0-9][A-Za-z0-9._-]{0,120}\.[A-Za-z0-9]{1,8})\b",
    )
    for pattern in patterns_by_priority:
        found = re.findall(pattern, haystack)
        if found:
            return str(found[-1]).strip().lower()
    return ""


def _looks_like_save_dialog(self, window: dict[str, Any] | None) -> bool:
    """Return True when *window* has a save-dialog title, or is a ``#32770`` dialog mentioning "save"."""
    if not isinstance(window, dict):
        return False
    title = str(window.get("title") or "").strip().lower()
    if title in {"save as", "save", "save as..."}:
        return True
    if "save" not in title:
        return False
    return str(window.get("class_name") or "").strip().lower() == "#32770"


def _command_stdout_confirms_target(self, stdout: str, target_filename: str) -> bool:
    """Decide whether command output positively confirms the target exists.

    Empty output or output containing a negative marker never confirms.
    Otherwise the output confirms when it contains *target_filename* or is
    exactly one of ``true`` / ``exists`` / ``ok`` (case-insensitive).
    """
    output = str(stdout or "").strip().lower()
    if not output:
        return False
    negative_markers = ("missing", "not found", "cannot find", "false")
    for marker in negative_markers:
        if marker in output:
            return False
    if target_filename and target_filename in output:
        return True
    return output in {"true", "exists", "ok"}


def _record_completion_evidence(
    self,
    *,
    kind: str,
    category: str,
    summary: str,
    detail: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
    """Record one piece of completion evidence, keyed by *kind*.

    A kind that is already recorded only has its step (and detail, when
    given) refreshed, and ``None`` is returned. A new kind is stored, logged,
    emitted as a ``completion_evidence`` event, and returned.
    """
    known = self.completion_evidence.get(kind)
    if known is not None:
        known["step"] = self.step
        if detail:
            known["detail"] = detail
        return None
    record = {
        "kind": kind,
        "category": category,
        "summary": summary,
        "detail": detail or {},
        "step": self.step,
    }
    self.completion_evidence[kind] = record
    self.logger.info("Completion evidence: %s", summary)
    if self.options.pretty_logs:
        pretty = json.dumps(record, ensure_ascii=False, indent=2)
        self.logger.debug("Completion evidence detail:\n%s", pretty)
    self._emit("completion_evidence", record)
    return record
def _completion_summary(self) -> str:
    """Join the recorded evidence summaries, ordered by step and then kind.

    Blank summaries are skipped. Returns a fixed placeholder sentence when no
    evidence has been recorded.
    """

    def _order(item: dict[str, Any]) -> tuple[int, str]:
        return (self._parse_int(item.get("step"), default=0), str(item.get("kind") or ""))

    ordered = sorted(self.completion_evidence.values(), key=_order)
    if not ordered:
        return "No completion evidence recorded."
    texts = (str(item.get("summary") or "").strip() for item in ordered)
    return " | ".join(text for text in texts if text)


def _clear_finish_likely(self, reason: str) -> None:
    """Reset an active finish-likely state and announce why; no-op when inactive."""
    if not bool(self.finish_likely_state.get("active")):
        return
    reset_values = {
        "active": False,
        "activated_at_step": 0,
        "summary": "",
        "fresh_verification_done": False,
        "verification_step": 0,
        "post_completion_visual_signature": "",
    }
    self.finish_likely_state.update(reset_values)
    self.logger.info("Finish-likely cleared: %s", reason)
    self._emit("finish_likely_cleared", {"reason": reason, "step": self.step})


def _maybe_activate_finish_likely(self) -> dict[str, Any] | None:
    """Activate the finish-likely state once both evidence categories exist.

    Activation requires at least one ``ui_signal`` and one
    ``independent_verifier`` evidence item and happens only while the state
    is inactive. Returns the emitted payload, or ``None`` when nothing
    changed.
    """
    if bool(self.finish_likely_state.get("active")):
        return None
    ui_signals: list[dict[str, Any]] = []
    independent: list[dict[str, Any]] = []
    for item in self.completion_evidence.values():
        category = str(item.get("category") or "")
        if category == "ui_signal":
            ui_signals.append(item)
        elif category == "independent_verifier":
            independent.append(item)
    if not (ui_signals and independent):
        return None
    summary = self._completion_summary()
    self.finish_likely_state.update(
        {
            "active": True,
            "activated_at_step": self.step,
            "summary": summary,
            "fresh_verification_done": False,
            "verification_step": 0,
            # Snapshot the screen fingerprint so a later see_screen can confirm stability.
            "post_completion_visual_signature": str(self.last_visual_signature or ""),
        }
    )
    payload = {
        "summary": summary,
        "step": self.step,
        "ui_signals": [str(item.get("kind") or "") for item in ui_signals],
        "independent_verifiers": [str(item.get("kind") or "") for item in independent],
        "target_filename": str(self.finish_likely_state.get("target_filename") or ""),
    }
    self.logger.info("Finish-likely activated: %s", summary)
    if self.options.pretty_logs:
        pretty = json.dumps(payload, ensure_ascii=False, indent=2)
        self.logger.debug("Finish-likely detail:\n%s", pretty)
    self._emit("finish_likely", payload)
    return payload


def _update_finish_likely_from_tool(
    self,
    tool_name: str,
    args: dict[str, Any],
    result: dict[str, Any],
) -> dict[str, Any]:
    """Fold a successful tool result into the completion-evidence tracking.

    Failed results are returned untouched. For ``get_active_window``,
    ``execute_command`` and ``see_screen`` the result is checked against the
    target filename / post-completion screen signature; matches are recorded
    as evidence and contradictions clear an active finish-likely state.
    Newly recorded evidence is attached to *result* and may activate the
    finish-likely state. While that state is active, an observation tool run
    on a later step than the activation marks the fresh verification as done.

    Returns the same *result* object, possibly with extra keys.
    """
    if not bool(result.get("ok")):
        return result
    gathered: list[dict[str, Any]] = []
    target = str(self.finish_likely_state.get("target_filename") or "").strip().lower()

    def _collect(**fields: Any) -> None:
        # Only genuinely new evidence is reported back to the caller.
        entry = self._record_completion_evidence(**fields)
        if entry is not None:
            gathered.append(entry)

    if tool_name == "get_active_window":
        window = result.get("window") if isinstance(result.get("window"), dict) else {}
        title = str(window.get("title") or "").strip()
        title_lower = title.lower()
        earlier_window = self.last_observed_window
        title_has_target = bool(target) and target in title_lower
        if title_has_target:
            _collect(
                kind="active_window_title_matches_target",
                category="ui_signal",
                summary=f'Foreground window title contains "{target}".',
                detail={"title": title},
            )
        if self._looks_like_save_dialog(earlier_window) and title_has_target:
            _collect(
                kind="save_dialog_closed_to_target_window",
                category="ui_signal",
                summary=f'Save dialog closed and focus returned to "{title}".',
                detail={
                    "from_title": str((earlier_window or {}).get("title") or ""),
                    "to_title": title,
                },
            )
        if bool(self.finish_likely_state.get("active")) and target and target not in title_lower:
            if self._looks_like_save_dialog(window):
                self._clear_finish_likely(f'Foreground window returned to "{title}" after completion evidence.')
        self.last_observed_window = dict(window)
    elif tool_name == "execute_command":
        command = str(args.get("command") or "").strip()
        stdout = str(result.get("stdout") or "").strip()
        if target and self._command_stdout_confirms_target(stdout, target):
            lines = stdout.splitlines()
            first_line = lines[0].strip() if lines else stdout
            _collect(
                kind="command_confirms_target_exists",
                category="independent_verifier",
                summary=f'Command verification confirms "{target}" exists.',
                detail={"command": command, "stdout": first_line[:300]},
            )
        if bool(self.finish_likely_state.get("active")) and target and stdout:
            stdout_lower = stdout.lower()
            if any(token in stdout_lower for token in ("missing", "not found", "cannot find", "false")):
                self._clear_finish_likely(f'Command verification contradicted completion for "{target}".')
    elif tool_name == "see_screen":
        meta = result.get("meta") if isinstance(result.get("meta"), dict) else {}
        seen_signature = str(meta.get("visual_signature") or "").strip()
        expected_signature = str(self.finish_likely_state.get("post_completion_visual_signature") or "").strip()
        still_active = bool(self.finish_likely_state.get("active"))
        if still_active and seen_signature and expected_signature and seen_signature == expected_signature:
            _collect(
                kind="verification_screenshot_matches_post_completion_state",
                category="independent_verifier",
                summary="Fresh verification screenshot matches the stable post-completion state.",
                detail={"visual_signature": seen_signature},
            )

    if gathered:
        result["completion_evidence"] = gathered
        activation = self._maybe_activate_finish_likely()
        if activation is not None:
            result["finish_likely"] = activation
    if bool(self.finish_likely_state.get("active")) and tool_name in FINISH_LIKELY_OBSERVATION_TOOLS:
        activated_at = self._parse_int(self.finish_likely_state.get("activated_at_step"), default=0)
        if self.step > activated_at:
            self.finish_likely_state["fresh_verification_done"] = True
            self.finish_likely_state["verification_step"] = self.step
            result["finish_likely_verification_done"] = True
    return result
def _clear_blocked_action_signatures(self) -> None:
    """Reset every blocked action-gate entry so the action may be tried again."""
    blocked_states = [state for state in self.action_gate_state.values() if bool(state.get("blocked"))]
    for state in blocked_states:
        state["attempts"] = 0
        state["awaiting_verification"] = False
        state["blocked"] = False


def _mark_observation(self, tool_name: str) -> None:
    """Register that observation tool *tool_name* ran.

    All blocked signatures are unblocked, and every gate entry that lists
    *tool_name* among its required verifiers stops awaiting verification.
    """
    self._clear_blocked_action_signatures()
    for state in self.action_gate_state.values():
        if tool_name in set(state.get("required_verifiers") or []):
            state["awaiting_verification"] = False


def _verification_policy(self, tool_name: str, args: dict[str, Any]) -> dict[str, Any] | None:
    """Describe how a repeated action must be verified before it is retried.

    Returns ``None`` for tools that are not gated (including unparsable key
    combos and commands that are not background launches). Otherwise returns
    a dict with the action ``signature``, the set of ``required_verifiers``
    and a ``hint`` for the model.
    """

    def _policy(signature: str, verifiers: set[str], hint: str) -> dict[str, Any]:
        return {"signature": signature, "required_verifiers": verifiers, "hint": hint}

    def _point(key: str) -> tuple[int, int]:
        raw = args.get(key)
        point = raw if isinstance(raw, dict) else {}
        x = self._parse_int((point or {}).get("x"), default=0)
        y = self._parse_int((point or {}).get("y"), default=0)
        return x, y

    def _button() -> str:
        return str(args.get("button", "left") or "left").strip().lower() or "left"

    if tool_name == "click":
        x, y = _point("coordinate")
        button = _button()
        click_count = clamp(self._parse_int(args.get("click_count"), default=1), 1, 5)
        return _policy(
            f"click:{x}:{y}:{button}:{click_count}",
            {"see_screen", "enhance"},
            "Verify the visible UI change with see_screen or enhance before repeating the same click.",
        )

    if tool_name == "type":
        # Only the first 60 characters identify the typed text in the signature.
        preview = str(args.get("text", ""))[:60].replace("\n", "\\n")
        return _policy(
            f"type:{preview}",
            {"see_screen", "enhance", "get_active_window"},
            "Verify where the text landed before typing the same content again.",
        )

    if tool_name == "scroll":
        amount = self._parse_int(args.get("amount"), default=0)
        direction = str(args.get("direction", "") or "").strip().lower()
        return _policy(
            f"scroll:{amount}:{direction}",
            {"see_screen", "enhance"},
            "Verify the page or panel moved before repeating the same scroll.",
        )

    if tool_name == "drag":
        start_x, start_y = _point("start_coordinate")
        end_x, end_y = _point("end_coordinate")
        button = _button()
        return _policy(
            f"drag:{start_x}:{start_y}:{end_x}:{end_y}:{button}",
            {"see_screen", "enhance", "get_cursor_position"},
            "Verify the dragged UI state before repeating the same drag path.",
        )

    if tool_name == "press_key":
        combo = self._parse_key_combo(str(args.get("key", "")))
        if not combo:
            return None
        combo_text = "+".join(combo)
        signature = f"press_key:{combo_text}"
        if combo_text in {"ctrl+c", "ctrl+x"}:
            return _policy(
                signature,
                {"clipboard_get"},
                "Verify the clipboard with clipboard_get before retrying the same copy or cut shortcut.",
            )
        if combo_text in {"ctrl+v", "ctrl+s", "ctrl+w", "alt+f4"}:
            return _policy(
                signature,
                {"see_screen", "enhance", "get_active_window"},
                "Verify the save, paste, close, or window outcome before sending the same shortcut again.",
            )
        keys = set(combo)
        switches_focus = any(pair <= keys for pair in ({"alt", "tab"}, {"win", "tab"}, {"win", "r"}))
        if switches_focus:
            return _policy(
                signature,
                {"get_active_window"},
                "Verify the foreground window with get_active_window before repeating the same focus or open shortcut.",
            )
        return _policy(
            signature,
            {"see_screen", "enhance"},
            "Verify the visible UI change before repeating the same shortcut.",
        )

    if tool_name == "execute_command":
        command = str(args.get("command", "") or "").strip()
        if not self._is_background_launch_command(command):
            return None
        # Whitespace and case are normalized so trivially different spellings share a signature.
        normalized = re.sub(r"\s+", " ", command).strip().lower()
        return _policy(
            f"execute_command:{normalized}",
            {"get_active_window", "see_screen"},
            (
                "Command-launched apps or URLs may stay in the background. "
                "Verify focus with get_active_window or see_screen before retrying or typing."
            ),
        )

    return None
def _check_action_gate(self, tool_name: str, args: dict[str, Any]) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
    """Decide whether a potentially repetitive action may run right now.

    Returns ``(blocked_result, policy)``. Both are ``None`` for ungated
    tools. ``blocked_result`` is ``None`` when the action may proceed;
    otherwise it explains why the action is refused: the signature is
    already blocked, the previous attempt still awaits verification, or the
    per-surface retry limit has been reached (which also marks the signature
    blocked). Gate counters are reset whenever the surface signature changes.
    """
    policy = self._verification_policy(tool_name, args)
    if policy is None:
        return None, None
    signature = str(policy["signature"])
    surface_signature = self._current_surface_signature()
    fresh_state = {
        "attempts": 0,
        "awaiting_verification": False,
        "blocked": False,
        "required_verifiers": set(policy.get("required_verifiers") or []),
        "surface_signature": surface_signature,
    }
    state = self.action_gate_state.setdefault(signature, fresh_state)
    if str(state.get("surface_signature") or "") != surface_signature:
        # A different surface means earlier attempts no longer count.
        state.update(
            {
                "attempts": 0,
                "awaiting_verification": False,
                "blocked": False,
                "surface_signature": surface_signature,
            }
        )
    state["required_verifiers"] = set(policy.get("required_verifiers") or [])

    def _refusal(message: str) -> dict[str, Any]:
        return {
            "ok": False,
            "blocked": True,
            "error": message,
            "signature": signature,
            "surface_signature": surface_signature,
            "required_verifiers": sorted(state["required_verifiers"]),
            "hint": str(policy.get("hint") or "").strip(),
        }

    if bool(state.get("blocked")):
        message = (
            "Repeated ambiguous action signature is blocked until you gather a fresh observation. "
            "Pivot instead of repeating the same action."
        )
        return _refusal(message), policy

    if bool(state.get("awaiting_verification")):
        verifier_text = ", ".join(sorted(state["required_verifiers"]))
        message = f"Verify the last {tool_name} outcome with {verifier_text} before retrying the same action."
        return _refusal(message), policy

    retry_limit = max(1, int(self.options.max_retries_per_surface or MAX_ACTION_SIGNATURE_ATTEMPTS))
    attempts_so_far = int(state.get("attempts", 0) or 0)
    if attempts_so_far >= retry_limit:
        state["blocked"] = True
        message = (
            f"This ambiguous action has already been attempted {retry_limit} time(s) on the same surface. "
            "Use a fresh observation and change strategy."
        )
        return _refusal(message), policy

    return None, policy
def _check_finish_likely_gate(self, tool_name: str, args: dict[str, Any]) -> dict[str, Any] | None:
    """Block further exploration once the objective already looks complete.

    Returns ``None`` (allowed) when the finish-likely state is inactive, for
    ``task_complete``, for observation tools while the fresh verification is
    still pending, and for ``enhance`` after it is done. Every other call
    gets a blocked result carrying the evidence summary and a hint tailored
    to the tool.
    """
    state = self.finish_likely_state
    if not bool(state.get("active")) or tool_name == "task_complete":
        return None
    summary = str(state.get("summary") or "").strip() or self._completion_summary()
    verification_done = bool(state.get("fresh_verification_done"))
    if not verification_done and tool_name in FINISH_LIKELY_OBSERVATION_TOOLS:
        return None
    if verification_done and tool_name == "enhance":
        return None

    if verification_done:
        tool_hint = "A fresh verification pass already happened. Finish now unless you have contradictory evidence."
    else:
        tool_hint = "Do one fresh verification pass, then finish."
    # Tool-specific guidance overrides the generic hint.
    if tool_name == "execute_command":
        tool_hint = "Do not re-search the filesystem or rerun save verification commands after completion evidence."
    elif tool_name == "press_key":
        combo_text = "+".join(self._parse_key_combo(str(args.get("key", ""))))
        if combo_text:
            tool_hint = f'Do not reopen menus or repeat "{combo_text}" after completion evidence.'

    return {
        "ok": False,
        "blocked": True,
        "blocked_reason": "finish_likely",
        "error": (
            "Objective already appears satisfied from runtime evidence. "
            "Stop re-exploring unchanged state and finish unless a new contradiction appears."
        ),
        "evidence_summary": summary,
        "verification_done": verification_done,
        "hint": tool_hint,
    }
+ if verification_done: + tool_hint = "A fresh verification pass already happened. Finish now unless you have contradictory evidence." + if tool_name == "execute_command": + tool_hint = "Do not re-search the filesystem or rerun save verification commands after completion evidence." + elif tool_name == "press_key": + combo_text = "+".join(self._parse_key_combo(str(args.get("key", "")))) + if combo_text: + tool_hint = f'Do not reopen menus or repeat "{combo_text}" after completion evidence.' + return { + "ok": False, + "blocked": True, + "blocked_reason": "finish_likely", + "error": ( + "Objective already appears satisfied from runtime evidence. " + "Stop re-exploring unchanged state and finish unless a new contradiction appears." + ), + "evidence_summary": summary, + "verification_done": verification_done, + "hint": tool_hint, + } + + def _window_signature(self, window: dict[str, Any] | None) -> str: + if not isinstance(window, dict) or not bool(window.get("available")): + return "" + hwnd = self._parse_int(window.get("hwnd"), default=0) + class_name = str(window.get("class_name") or "").strip() + title = str(window.get("title") or "").strip() + return f"{hwnd}|{class_name}|{title}" + + def _window_summary(self, window: dict[str, Any] | None) -> str: + if not isinstance(window, dict) or not bool(window.get("available")): + return "Unavailable window state" + title = str(window.get("title") or "").strip() or "(untitled)" + class_name = str(window.get("class_name") or "").strip() or "unknown" + return f"{title} [{class_name}]" + + def _current_surface_signature(self) -> str: + target = self.last_surface_state.get("target") if isinstance(self.last_surface_state, dict) else None + target_handle = self._parse_int((target or {}).get("handle") if isinstance(target, dict) else 0, default=0) + return "|".join( + [ + str(self.last_surface_state.get("surface_kind") or "unknown"), + str(self.last_surface_state.get("dialog_kind") or "none"), + 
def _stable_observation_loop(self) -> dict[str, Any] | None:
    """Report a run of observation-only steps on a single unchanged window.

    Looks at the last ``MAX_STABLE_OBSERVATION_STEPS`` entries of
    ``self.step_history``. Returns ``None`` unless there are that many
    entries, each one used only tools from ``OBSERVATION_TOOL_NAMES``, each
    one had a visual tool, and exactly one distinct non-empty window
    signature appears among them. Otherwise returns that signature, the
    most recent window summary, and the number of repeated steps.
    """
    window = self.step_history[-MAX_STABLE_OBSERVATION_STEPS:]
    if len(window) < MAX_STABLE_OBSERVATION_STEPS:
        return None
    if not all(step.get("tool_names") for step in window):
        return None
    if not all(set(step["tool_names"]).issubset(OBSERVATION_TOOL_NAMES) for step in window):
        return None
    if not all(bool(step.get("had_visual")) for step in window):
        return None

    # Entries without a window signature are ignored rather than counted.
    distinct: set[str] = set()
    for step in window:
        signature = str(step.get("window_signature") or "").strip()
        if signature:
            distinct.add(signature)
    if len(distinct) != 1:
        return None

    (only_signature,) = distinct
    latest = window[-1]
    return {
        "signature": only_signature,
        "window_summary": str(latest.get("window_summary") or "").strip(),
        "repeated_steps": len(window),
    }
+ ), + "window_summary": window_summary, + "repeated_steps": repeated_steps, + "hint": ( + "Stop broad re-observation on the same unchanged window. " + "Act on the visible dialog, use enhance only if text is too small, or finish if the task is already complete." + ), + } + + def _note_action_attempt(self, policy: dict[str, Any]) -> None: + signature = str(policy["signature"]) + state = self.action_gate_state.setdefault(signature, {}) + state["attempts"] = int(state.get("attempts", 0) or 0) + 1 + state["awaiting_verification"] = True + state["blocked"] = False + state["required_verifiers"] = set(policy.get("required_verifiers") or []) + state["surface_signature"] = self._current_surface_signature() + + def _decorate_verification_result( + self, + result: dict[str, Any], + policy: dict[str, Any] | None, + ) -> dict[str, Any]: + if policy is None or not bool(result.get("ok")): + return result + verifiers = sorted(set(policy.get("required_verifiers") or [])) + if not verifiers: + return result + result["verification_required"] = True + result["verification_channels"] = verifiers + hint = str(policy.get("hint") or "").strip() + if hint: + result["next_step_hint"] = hint + return result + def _parse_px(self, value: Any) -> int: if value is None: return 0 @@ -388,14 +2031,508 @@ class ScreenJobAgent: except Exception: # noqa: BLE001 return default + def _clamp_screen_coordinate(self, x: int, y: int) -> tuple[int, int, int, int]: + width, height = pyautogui.size() + clamped_x = clamp(x, 0, max(0, width - 1)) + clamped_y = clamp(y, 0, max(0, height - 1)) + return clamped_x, clamped_y, width, height + + def _wait_after_action(self, seconds: float) -> None: + wait_remaining = seconds + while wait_remaining > 0: + if self._is_cancelled(): + break + interval = min(0.05, wait_remaining) + time.sleep(interval) + wait_remaining -= interval + + def _require_windows(self) -> None: + if os.name != "nt": + raise RuntimeError("This tool is currently implemented for Windows only.") + + 
def _get_process_executable_name(self, process_id: int) -> str: + self._require_windows() + if process_id <= 0: + return "" + kernel32 = ctypes.windll.kernel32 + process_query_limited_information = 0x1000 + process_handle = kernel32.OpenProcess(process_query_limited_information, False, process_id) + if not process_handle: + return "" + try: + buffer_len = ctypes.c_ulong(1024) + buffer = ctypes.create_unicode_buffer(buffer_len.value) + if not kernel32.QueryFullProcessImageNameW(process_handle, 0, buffer, ctypes.byref(buffer_len)): + return "" + full_path = buffer.value + return os.path.basename(full_path) + finally: + kernel32.CloseHandle(process_handle) + + def _get_window_text(self, hwnd: int) -> str: + self._require_windows() + user32 = ctypes.windll.user32 + title_length = int(user32.GetWindowTextLengthW(hwnd)) + title_buffer = ctypes.create_unicode_buffer(title_length + 1) + user32.GetWindowTextW(hwnd, title_buffer, len(title_buffer)) + return title_buffer.value + + def _get_class_name(self, hwnd: int) -> str: + self._require_windows() + user32 = ctypes.windll.user32 + class_buffer = ctypes.create_unicode_buffer(256) + user32.GetClassNameW(hwnd, class_buffer, len(class_buffer)) + return class_buffer.value + + def _window_rect_for_handle(self, hwnd: int) -> dict[str, int]: + self._require_windows() + from ctypes import wintypes + + class RECT(ctypes.Structure): + _fields_ = [ + ("left", wintypes.LONG), + ("top", wintypes.LONG), + ("right", wintypes.LONG), + ("bottom", wintypes.LONG), + ] + + user32 = ctypes.windll.user32 + rect = RECT() + if not user32.GetWindowRect(hwnd, ctypes.byref(rect)): + raise ctypes.WinError() + return { + "left": int(rect.left), + "top": int(rect.top), + "right": int(rect.right), + "bottom": int(rect.bottom), + "width": int(rect.right - rect.left), + "height": int(rect.bottom - rect.top), + } + + def _window_info_from_handle(self, hwnd: int) -> dict[str, Any]: + self._require_windows() + from ctypes import wintypes + + user32 = 
ctypes.windll.user32 + if not hwnd or not user32.IsWindow(hwnd): + return {"available": False} + process_id = wintypes.DWORD() + thread_id = int(user32.GetWindowThreadProcessId(hwnd, ctypes.byref(process_id))) + owner_hwnd = int(user32.GetWindow(hwnd, 4)) # GW_OWNER + title = self._get_window_text(hwnd) + class_name = self._get_class_name(hwnd) + rect = self._window_rect_for_handle(hwnd) + executable_name = self._get_process_executable_name(int(process_id.value)) + info = { + "available": True, + "hwnd": int(hwnd), + "title": title, + "class_name": class_name, + "thread_id": thread_id, + "process_id": int(process_id.value), + "executable_name": executable_name, + "is_visible": bool(user32.IsWindowVisible(hwnd)), + "is_enabled": bool(user32.IsWindowEnabled(hwnd)), + "is_minimized": bool(user32.IsIconic(hwnd)), + "is_maximized": bool(user32.IsZoomed(hwnd)), + "owner_hwnd": owner_hwnd, + "owner_title": self._get_window_text(owner_hwnd) if owner_hwnd else "", + "rect": rect, + } + surface_kind = self._surface_kind_from_window(info) + dialog_kind = self._dialog_kind_from_window(info) + info["surface_kind"] = surface_kind + info["dialog_kind"] = dialog_kind + info["browser_workflow_state"] = self._browser_workflow_state(info) + info["automation_available"] = surface_kind not in {"secure_desktop", "unknown"} + info["modal_owner_hwnd"] = owner_hwnd if dialog_kind != "none" else 0 + return info + + def _list_windows_info(self, *, visible_only: bool = True) -> list[dict[str, Any]]: + self._require_windows() + from ctypes import wintypes + + user32 = ctypes.windll.user32 + windows: list[dict[str, Any]] = [] + + enum_proc = ctypes.WINFUNCTYPE(ctypes.c_bool, wintypes.HWND, wintypes.LPARAM) + + def callback(hwnd: int, _: int) -> bool: + if visible_only and not user32.IsWindowVisible(hwnd): + return True + info = self._window_info_from_handle(int(hwnd)) + title = str(info.get("title") or "").strip() + rect = info.get("rect") if isinstance(info.get("rect"), dict) else {} + if 
visible_only and not title and self._parse_int(rect.get("width"), default=0) <= 0: + return True + windows.append(info) + return True + + if not user32.EnumWindows(enum_proc(callback), 0): + raise ctypes.WinError() + return windows + + def _find_window_info( + self, + *, + hwnd: int = 0, + title_contains: str = "", + class_name: str = "", + executable_name: str = "", + visible_only: bool = True, + ) -> dict[str, Any] | None: + title_filter = str(title_contains or "").strip().lower() + class_filter = str(class_name or "").strip().lower() + exe_filter = str(executable_name or "").strip().lower() + if hwnd: + candidate = self._window_info_from_handle(hwnd) + if not bool(candidate.get("available")): + return None + return candidate + windows = self._list_windows_info(visible_only=visible_only) + for window in windows: + if title_filter and title_filter not in str(window.get("title") or "").strip().lower(): + continue + if class_filter and class_filter != str(window.get("class_name") or "").strip().lower(): + continue + if exe_filter and exe_filter not in str(window.get("executable_name") or "").strip().lower(): + continue + return window + return None + + def _wait_for( + self, + *, + timeout_seconds: float, + predicate: Callable[[], Any], + interval_seconds: float = 0.15, + ) -> Any | None: + deadline = time.time() + max(0.1, timeout_seconds) + while time.time() <= deadline: + if self._is_cancelled(): + return None + value = predicate() + if value: + return value + time.sleep(interval_seconds) + return None + + def _set_foreground_window(self, hwnd: int) -> bool: + self._require_windows() + user32 = ctypes.windll.user32 + if not hwnd or not user32.IsWindow(hwnd): + return False + if user32.IsIconic(hwnd): + user32.ShowWindow(hwnd, 9) # SW_RESTORE + return bool(user32.SetForegroundWindow(hwnd)) + + def _close_window_handle(self, hwnd: int) -> bool: + self._require_windows() + user32 = ctypes.windll.user32 + if not hwnd or not user32.IsWindow(hwnd): + return False + 
wm_close = 0x0010 + return bool(user32.PostMessageW(hwnd, wm_close, 0, 0)) + + def _list_ui_elements_for_window( + self, + hwnd: int, + *, + include_hidden: bool = False, + ) -> list[dict[str, Any]]: + self._require_windows() + from ctypes import wintypes + + user32 = ctypes.windll.user32 + if not hwnd or not user32.IsWindow(hwnd): + return [] + elements: list[dict[str, Any]] = [] + enum_proc = ctypes.WINFUNCTYPE(ctypes.c_bool, wintypes.HWND, wintypes.LPARAM) + role_map = { + "button": "button", + "edit": "text_input", + "combobox": "combo_box", + "listbox": "list_box", + "syslistview32": "list_view", + "systabcontrol32": "tab", + } + + def callback(child_hwnd: int, _: int) -> bool: + if not include_hidden and not user32.IsWindowVisible(child_hwnd): + return True + class_name = self._get_class_name(int(child_hwnd)) + text = self._get_window_text(int(child_hwnd)) + normalized_class = class_name.strip().lower() + role = role_map.get(normalized_class, normalized_class or "control") + rect = self._window_rect_for_handle(int(child_hwnd)) + control_id = int(user32.GetDlgCtrlID(child_hwnd)) + element = { + "handle": int(child_hwnd), + "window_handle": int(hwnd), + "control_id": control_id, + "class_name": class_name, + "role": role, + "text": text, + "is_visible": bool(user32.IsWindowVisible(child_hwnd)), + "is_enabled": bool(user32.IsWindowEnabled(child_hwnd)), + "rect": rect, + "target": self._build_target_handle( + int(child_hwnd), + target_type="ui_element", + window_handle=int(hwnd), + control_id=control_id, + class_name=class_name, + text=text, + ), + } + elements.append(element) + return True + + user32.EnumChildWindows(hwnd, enum_proc(callback), 0) + return elements + + def _filter_ui_elements( + self, + elements: list[dict[str, Any]], + *, + text_contains: str = "", + class_name: str = "", + role: str = "", + ) -> list[dict[str, Any]]: + filtered: list[dict[str, Any]] = [] + for element in elements: + if text_contains and text_contains.lower() not in 
str(element.get("text") or "").strip().lower(): + continue + if class_name and class_name.lower() != str(element.get("class_name") or "").strip().lower(): + continue + if role and role.lower() != str(element.get("role") or "").strip().lower(): + continue + filtered.append(element) + return filtered + + def _find_dialog_info(self, *, title_contains: str = "") -> dict[str, Any] | None: + active = self._get_active_window_info() + candidates: list[dict[str, Any]] = [] + if bool(active.get("available")): + candidates.append(active) + candidates.extend(self._list_windows_info(visible_only=True)) + seen: set[int] = set() + for window in candidates: + hwnd = self._parse_int(window.get("hwnd"), default=0) + if hwnd in seen: + continue + seen.add(hwnd) + if self._dialog_kind_from_window(window) == "none": + continue + if title_contains and title_contains.lower() not in str(window.get("title") or "").strip().lower(): + continue + return window + return None + + def _find_dialog_button(self, dialog_hwnd: int, action: str) -> dict[str, Any] | None: + elements = self._list_ui_elements_for_window(dialog_hwnd, include_hidden=False) + text_map = { + "ok": ("ok",), + "open": ("open", "&open"), + "save": ("save", "&save"), + "cancel": ("cancel",), + "yes": ("yes", "&yes"), + "no": ("no", "&no"), + "close": ("close",), + } + wanted = text_map.get(action, (action,)) + for element in elements: + if str(element.get("role") or "") != "button": + continue + text = str(element.get("text") or "").strip().lower() + if any(text == candidate or candidate in text for candidate in wanted): + return element + return None + + def _invoke_window_control(self, handle: int) -> bool: + self._require_windows() + user32 = ctypes.windll.user32 + bm_click = 0x00F5 + return bool(user32.SendMessageW(handle, bm_click, 0, 0) or user32.PostMessageW(handle, bm_click, 0, 0)) + + def _set_window_value(self, handle: int, text: str) -> bool: + self._require_windows() + user32 = ctypes.windll.user32 + wm_settext = 
0x000C + return bool(user32.SendMessageW(handle, wm_settext, 0, text)) + + def _clipboard_win32_api(self) -> tuple[Any, Any, Any]: + self._require_windows() + from ctypes import wintypes + + user32 = ctypes.WinDLL("user32", use_last_error=True) + kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) + + user32.OpenClipboard.argtypes = [wintypes.HWND] + user32.OpenClipboard.restype = wintypes.BOOL + user32.CloseClipboard.argtypes = [] + user32.CloseClipboard.restype = wintypes.BOOL + user32.EmptyClipboard.argtypes = [] + user32.EmptyClipboard.restype = wintypes.BOOL + user32.IsClipboardFormatAvailable.argtypes = [ctypes.c_uint] + user32.IsClipboardFormatAvailable.restype = wintypes.BOOL + user32.GetClipboardData.argtypes = [ctypes.c_uint] + user32.GetClipboardData.restype = wintypes.HANDLE + user32.SetClipboardData.argtypes = [ctypes.c_uint, wintypes.HANDLE] + user32.SetClipboardData.restype = wintypes.HANDLE + user32.EnumClipboardFormats.argtypes = [ctypes.c_uint] + user32.EnumClipboardFormats.restype = ctypes.c_uint + user32.GetClipboardFormatNameW.argtypes = [ctypes.c_uint, wintypes.LPWSTR, ctypes.c_int] + user32.GetClipboardFormatNameW.restype = ctypes.c_int + + kernel32.GlobalAlloc.argtypes = [ctypes.c_uint, ctypes.c_size_t] + kernel32.GlobalAlloc.restype = wintypes.HGLOBAL + kernel32.GlobalLock.argtypes = [wintypes.HGLOBAL] + kernel32.GlobalLock.restype = wintypes.LPVOID + kernel32.GlobalUnlock.argtypes = [wintypes.HGLOBAL] + kernel32.GlobalUnlock.restype = wintypes.BOOL + kernel32.GlobalFree.argtypes = [wintypes.HGLOBAL] + kernel32.GlobalFree.restype = wintypes.HGLOBAL + + return user32, kernel32, wintypes + + def _open_clipboard_with_retry(self, user32: Any, hwnd: Any, attempts: int = 8, delay_seconds: float = 0.05) -> None: + for attempt in range(max(1, attempts)): + if user32.OpenClipboard(hwnd): + return + if attempt + 1 < attempts: + time.sleep(delay_seconds) + raise ctypes.WinError(ctypes.get_last_error()) + + def _clipboard_set_text_via_shell(self, 
text: str) -> None: + completed = subprocess.run( + [ + "powershell", + "-NoProfile", + "-Command", + "Set-Clipboard -Value ([Console]::In.ReadToEnd())", + ], + input=text, + text=True, + capture_output=True, + timeout=max(2, min(int(self.options.command_timeout), 30)), + check=False, + ) + if completed.returncode != 0: + stderr = (completed.stderr or completed.stdout or "").strip() + raise RuntimeError(stderr or "PowerShell clipboard fallback failed.") + + def _clipboard_get_text(self) -> str: + user32, kernel32, wintypes = self._clipboard_win32_api() + cf_unicode_text = 13 + + self._open_clipboard_with_retry(user32, wintypes.HWND(0)) + try: + if not user32.IsClipboardFormatAvailable(cf_unicode_text): + return "" + handle = user32.GetClipboardData(cf_unicode_text) + if not handle: + return "" + locked = kernel32.GlobalLock(handle) + if not locked: + raise ctypes.WinError(ctypes.get_last_error()) + try: + return ctypes.wstring_at(locked) + finally: + kernel32.GlobalUnlock(handle) + finally: + user32.CloseClipboard() + + def _clipboard_set_text(self, text: str) -> None: + user32, kernel32, wintypes = self._clipboard_win32_api() + cf_unicode_text = 13 + gmem_moveable = 0x0002 + buffer = ctypes.create_unicode_buffer(text) + size_bytes = ctypes.sizeof(buffer) + + self._open_clipboard_with_retry(user32, wintypes.HWND(0)) + handle = None + try: + if not user32.EmptyClipboard(): + raise ctypes.WinError(ctypes.get_last_error()) + handle = kernel32.GlobalAlloc(gmem_moveable, size_bytes) + if not handle: + raise ctypes.WinError(ctypes.get_last_error()) + locked = kernel32.GlobalLock(handle) + if not locked: + raise ctypes.WinError(ctypes.get_last_error()) + try: + ctypes.memmove(locked, buffer, size_bytes) + finally: + kernel32.GlobalUnlock(handle) + if not user32.SetClipboardData(cf_unicode_text, handle): + raise ctypes.WinError(ctypes.get_last_error()) + handle = None + finally: + if handle: + kernel32.GlobalFree(handle) + user32.CloseClipboard() + + def 
_clipboard_get_metadata(self) -> dict[str, Any]: + user32, _, wintypes = self._clipboard_win32_api() + known_formats = { + 1: "CF_TEXT", + 2: "CF_BITMAP", + 3: "CF_METAFILEPICT", + 8: "CF_DIB", + 13: "CF_UNICODETEXT", + 14: "CF_ENHMETAFILE", + 15: "CF_HDROP", + 17: "CF_DIBV5", + } + image_formats = {2, 8, 17} + formats: list[str] = [] + has_text = False + has_image = False + + self._open_clipboard_with_retry(user32, wintypes.HWND(0)) + try: + fmt = 0 + while True: + fmt = int(user32.EnumClipboardFormats(fmt)) + if fmt == 0: + break + label = known_formats.get(fmt) + if label is None: + name_buffer = ctypes.create_unicode_buffer(256) + read = int(user32.GetClipboardFormatNameW(fmt, name_buffer, len(name_buffer))) + label = name_buffer.value if read > 0 else f"FORMAT_{fmt}" + formats.append(label) + if fmt == 13: + has_text = True + if fmt in image_formats: + has_image = True + return { + "has_text": has_text, + "has_image": has_image, + "available_formats": formats, + } + finally: + user32.CloseClipboard() + + def _get_active_window_info(self) -> dict[str, Any]: + self._require_windows() + user32 = ctypes.windll.user32 + hwnd = user32.GetForegroundWindow() + if not hwnd: + return {"available": False} + return self._window_info_from_handle(int(hwnd)) + def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]: image, meta = self._capture_screen(with_grid=True) out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png" self._save_image(image, out_path) data_url = image_to_data_url(image, "PNG") + visual_signature = self._compute_visual_signature(image) + self.last_visual_signature = visual_signature self.last_screen_data_url = data_url - self.last_screen_meta = meta | {"path": str(out_path.resolve())} + self.last_screen_meta = meta | {"path": str(out_path.resolve()), "visual_signature": visual_signature} return { "ok": True, "path": str(out_path.resolve()), @@ -493,6 +2630,7 @@ class ScreenJobAgent: ) self._save_image(enhanced, out_path) data_url = 
image_to_data_url(enhanced, "PNG") + visual_signature = self._compute_visual_signature(enhanced) meta = { "captured_at": utc_now_iso(), @@ -507,7 +2645,9 @@ class ScreenJobAgent: "target_pixel": {"x": cx, "y": cy}, "screen_size": {"width": width, "height": height}, "base_capture_meta": base_meta, + "visual_signature": visual_signature, } + self.last_visual_signature = visual_signature self.last_screen_data_url = data_url self.last_screen_meta = meta return {"ok": True, "meta": meta, "message": "Enhanced view generated."} @@ -515,8 +2655,8 @@ class ScreenJobAgent: def _tool_click(self, args: dict[str, Any]) -> dict[str, Any]: coord = args.get("coordinate") or {} offset = args.get("offset") or {} - base_x = int(coord.get("x", 0)) - base_y = int(coord.get("y", 0)) + base_x = self._parse_int(coord.get("x", 0), default=0) + base_y = self._parse_int(coord.get("y", 0), default=0) legacy_dx = self._parse_px(offset.get("x", 0)) legacy_dy = self._parse_px(offset.get("y", 0)) @@ -524,12 +2664,20 @@ class ScreenJobAgent: down = self._parse_px(args.get("offset_down", 0)) left = self._parse_px(args.get("offset_left", 0)) right = self._parse_px(args.get("offset_right", 0)) + button = str(args.get("button", "left") or "left").strip().lower() + if button not in {"left", "middle", "right"}: + button = "left" + click_count = clamp(self._parse_int(args.get("click_count"), default=1), 1, 5) + interval_seconds = self._parse_seconds(args.get("interval_seconds"), default=0.0, max_seconds=5.0) + duration_seconds = self._parse_seconds( + args.get("duration_seconds"), + default=self.options.click_pause, + max_seconds=5.0, + ) x = base_x + legacy_dx + right - left y = base_y + legacy_dy + down - up - width, height = pyautogui.size() - x = clamp(x, 0, max(0, width - 1)) - y = clamp(y, 0, max(0, height - 1)) + x, y, width, height = self._clamp_screen_coordinate(x, y) now = time.time() self.click_history.append((x, y, now)) @@ -546,27 +2694,26 @@ class ScreenJobAgent: "blocked": True, "error": ( 
"Repeated click loop detected at nearly same coordinate. " - "Switch strategy: call see_screen/enhance and use execute_command." + "Do not retry blindly. Re-observe with see_screen/enhance, verify focus with get_active_window, " + "and switch strategy before acting again." ), "clicked": {"x": x, "y": y}, "recent_similar_clicks": len(near_same), } - pyautogui.moveTo(x, y, duration=self.options.click_pause) - pyautogui.click(x=x, y=y) + pyautogui.moveTo(x, y, duration=duration_seconds) + pyautogui.click(x=x, y=y, clicks=click_count, interval=interval_seconds, button=button) sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0) - wait_remaining = sleep_after if sleep_after > 0 else 0.15 - while wait_remaining > 0: - if self._is_cancelled(): - break - interval = min(0.05, wait_remaining) - time.sleep(interval) - wait_remaining -= interval + self._wait_after_action(sleep_after if sleep_after > 0 else 0.15) return { "ok": True, "clicked": {"x": x, "y": y}, "base_coordinate": {"x": base_x, "y": base_y}, + "button": button, + "click_count": click_count, + "interval_seconds": interval_seconds, + "duration_seconds": duration_seconds, "applied_offset": { "legacy": {"x": legacy_dx, "y": legacy_dy}, "directional": {"up": up, "down": down, "left": left, "right": right}, @@ -576,6 +2723,830 @@ class ScreenJobAgent: "message": "Click executed.", } + def _tool_scroll(self, args: dict[str, Any]) -> dict[str, Any]: + amount = self._parse_int(args.get("amount"), default=0) + direction = str(args.get("direction", "") or "").strip().lower() + if direction == "up": + applied_amount = abs(amount) + elif direction == "down": + applied_amount = -abs(amount) + else: + applied_amount = amount + + moved_to = None + coordinate = args.get("coordinate") + width, height = pyautogui.size() + if isinstance(coordinate, dict): + target_x = self._parse_int(coordinate.get("x"), default=0) + target_y = self._parse_int(coordinate.get("y"), default=0) + x, y, 
width, height = self._clamp_screen_coordinate(target_x, target_y) + pyautogui.moveTo(x, y, duration=self.options.click_pause) + moved_to = {"x": x, "y": y} + + pyautogui.scroll(applied_amount) + sleep_after = self._parse_seconds(args.get("sleep_after_seconds"), default=0.0, max_seconds=30.0) + if sleep_after > 0: + self._wait_after_action(sleep_after) + return { + "ok": True, + "amount": applied_amount, + "direction": "up" if applied_amount >= 0 else "down", + "moved_to": moved_to, + "screen_size": {"width": width, "height": height}, + "sleep_after_seconds": sleep_after, + "message": "Scroll executed.", + } + + def _tool_drag(self, args: dict[str, Any]) -> dict[str, Any]: + start_coord = args.get("start_coordinate") or {} + end_coord = args.get("end_coordinate") or {} + start_x = self._parse_int(start_coord.get("x"), default=0) + start_y = self._parse_int(start_coord.get("y"), default=0) + end_x = self._parse_int(end_coord.get("x"), default=0) + end_y = self._parse_int(end_coord.get("y"), default=0) + button = str(args.get("button", "left") or "left").strip().lower() + if button not in {"left", "middle", "right"}: + button = "left" + duration_seconds = self._parse_seconds(args.get("duration_seconds"), default=0.2, max_seconds=10.0) + clamped_start_x, clamped_start_y, width, height = self._clamp_screen_coordinate(start_x, start_y) + clamped_end_x, clamped_end_y, _, _ = self._clamp_screen_coordinate(end_x, end_y) + + pyautogui.moveTo(clamped_start_x, clamped_start_y, duration=self.options.click_pause) + pyautogui.dragTo(clamped_end_x, clamped_end_y, duration=duration_seconds, button=button) + sleep_after = self._parse_seconds(args.get("sleep_after_seconds"), default=0.0, max_seconds=30.0) + if sleep_after > 0: + self._wait_after_action(sleep_after) + return { + "ok": True, + "from": {"x": clamped_start_x, "y": clamped_start_y}, + "to": {"x": clamped_end_x, "y": clamped_end_y}, + "requested_from": {"x": start_x, "y": start_y}, + "requested_to": {"x": end_x, "y": 
end_y}, + "button": button, + "duration_seconds": duration_seconds, + "sleep_after_seconds": sleep_after, + "screen_size": {"width": width, "height": height}, + "message": "Drag executed.", + } + + def _tool_move_mouse(self, args: dict[str, Any]) -> dict[str, Any]: + coord = args.get("coordinate") or {} + requested_x = self._parse_int(coord.get("x"), default=0) + requested_y = self._parse_int(coord.get("y"), default=0) + x, y, width, height = self._clamp_screen_coordinate(requested_x, requested_y) + duration_seconds = self._parse_seconds( + args.get("duration_seconds"), + default=self.options.click_pause, + max_seconds=5.0, + ) + pyautogui.moveTo(x, y, duration=duration_seconds) + return { + "ok": True, + "requested_coordinate": {"x": requested_x, "y": requested_y}, + "moved_to": {"x": x, "y": y}, + "duration_seconds": duration_seconds, + "screen_size": {"width": width, "height": height}, + "message": "Mouse moved.", + } + + def _tool_clipboard_get(self, _: dict[str, Any]) -> dict[str, Any]: + text = self._clipboard_get_text() + try: + metadata = self._clipboard_get_metadata() + except Exception: # noqa: BLE001 + metadata = { + "has_text": bool(text), + "has_image": False, + "available_formats": ["CF_UNICODETEXT"] if text else [], + } + has_text = bool(metadata.get("has_text")) or bool(text) + available_formats = metadata.get("available_formats") + if not isinstance(available_formats, list): + available_formats = [] + return { + "ok": True, + "text": text, + "length": len(text), + "has_text": has_text, + "has_image": bool(metadata.get("has_image")), + "available_formats": [str(item) for item in available_formats], + "message": "Clipboard state read.", + } + + def _tool_clipboard_set(self, args: dict[str, Any]) -> dict[str, Any]: + text = str(args.get("text", "")) + try: + self._clipboard_set_text(text) + return { + "ok": True, + "length": len(text), + "message": "Clipboard text updated.", + } + except Exception as native_exc: # noqa: BLE001 + try: + 
self._clipboard_set_text_via_shell(text) + return { + "ok": True, + "length": len(text), + "used_shell_fallback": True, + "message": "Clipboard text updated via PowerShell fallback.", + "native_error": f"{type(native_exc).__name__}: {native_exc}", + } + except Exception as fallback_exc: # noqa: BLE001 + return { + "ok": False, + "length": len(text), + "error": ( + f"Clipboard update failed. Native path: {type(native_exc).__name__}: {native_exc}. " + f"Fallback path: {type(fallback_exc).__name__}: {fallback_exc}" + ), + } + + def _tool_get_cursor_position(self, _: dict[str, Any]) -> dict[str, Any]: + position = pyautogui.position() + return { + "ok": True, + "position": {"x": int(position.x), "y": int(position.y)}, + "message": "Cursor position captured.", + } + + def _tool_list_windows(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native window enumeration is disabled for this run.", + ) + visible_only = bool(args.get("visible_only", True)) + windows = self._list_windows_info(visible_only=visible_only) + active = self._get_active_window_info() + result = self._build_native_result( + ok=True, + message="Window list captured.", + window=active, + confidence=0.93 if windows else 0.2, + native_channel="window", + ) + result["windows"] = windows + result["count"] = len(windows) + return result + + def _tool_find_window(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native window lookup is disabled for this run.", + ) + window = self._find_window_info( + hwnd=self._parse_int(args.get("hwnd"), default=0), + title_contains=str(args.get("title_contains") or ""), + class_name=str(args.get("class_name") or ""), + executable_name=str(args.get("executable_name") or ""), + visible_only=bool(args.get("visible_only", True)), + ) + if window is None: + return { + 
**self._build_native_result( + ok=False, + message="Window not found.", + window=self._get_active_window_info(), + confidence=0.2, + native_channel="window", + ), + "error": "No matching window found.", + } + target = self._build_target_handle( + self._parse_int(window.get("hwnd"), default=0), + target_type="window", + class_name=str(window.get("class_name") or ""), + text=str(window.get("title") or ""), + ) + self._store_native_target("window", target) + result = self._build_native_result( + ok=True, + message="Matching window found.", + window=window, + target=target, + confidence=0.96, + native_channel="window", + ) + result["window"] = window + return result + + def _tool_focus_window(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native window focusing is disabled for this run.", + ) + window = self._find_window_info( + hwnd=self._parse_int(args.get("hwnd"), default=0), + title_contains=str(args.get("title_contains") or ""), + class_name=str(args.get("class_name") or ""), + executable_name=str(args.get("executable_name") or ""), + visible_only=True, + ) + if window is None: + return { + **self._build_native_result( + ok=False, + message="Target window not found.", + window=self._get_active_window_info(), + confidence=0.2, + native_channel="window", + ), + "error": "No matching window to focus.", + } + hwnd = self._parse_int(window.get("hwnd"), default=0) + if not self._set_foreground_window(hwnd): + return { + **self._build_native_result( + ok=False, + message="Window focus request failed.", + window=window, + confidence=0.35, + native_channel="window", + ), + "error": "Failed to bring the target window to the foreground.", + } + waited = self._wait_for( + timeout_seconds=self.options.focus_timeout_seconds, + predicate=lambda: ( + matched + if (matched := self._get_active_window_info()) and self._parse_int(matched.get("hwnd"), default=0) == hwnd + else 
None + ), + ) + active = waited if isinstance(waited, dict) else self._get_active_window_info() + target = self._build_target_handle( + hwnd, + target_type="window", + class_name=str(window.get("class_name") or ""), + text=str(window.get("title") or ""), + ) + self._store_native_target("window", target) + self._emit( + "window_focused", + { + "hwnd": hwnd, + "title": str(window.get("title") or ""), + "class_name": str(window.get("class_name") or ""), + "executable_name": str(window.get("executable_name") or ""), + }, + ) + result = self._build_native_result( + ok=True, + message="Window focus verified.", + window=active, + target=target, + confidence=0.97 if self._parse_int(active.get("hwnd"), default=0) == hwnd else 0.7, + native_channel="window", + ) + result["window"] = active + return result + + def _tool_close_window(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native window closing is disabled for this run.", + ) + window = self._find_window_info( + hwnd=self._parse_int(args.get("hwnd"), default=0), + title_contains=str(args.get("title_contains") or ""), + class_name=str(args.get("class_name") or ""), + visible_only=True, + ) + if window is None: + return {"ok": False, "error": "No matching window to close."} + hwnd = self._parse_int(window.get("hwnd"), default=0) + if not self._close_window_handle(hwnd): + return {"ok": False, "error": "WM_CLOSE could not be posted to the target window."} + result = self._build_native_result( + ok=True, + message="Window close requested.", + window=window, + target=self._build_target_handle(hwnd, target_type="window", class_name=str(window.get("class_name") or "")), + confidence=0.9, + native_channel="window", + ) + result["window"] = window + return result + + def _tool_wait_for_window(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return 
self._native_control_unavailable_result( + message="Native window waits are disabled for this run.", + ) + timeout_seconds = self._parse_seconds( + args.get("timeout_seconds"), + default=self.options.dialog_timeout_seconds, + max_seconds=120.0, + ) + window = self._wait_for( + timeout_seconds=timeout_seconds, + predicate=lambda: self._find_window_info( + title_contains=str(args.get("title_contains") or ""), + class_name=str(args.get("class_name") or ""), + executable_name=str(args.get("executable_name") or ""), + visible_only=bool(args.get("visible_only", True)), + ), + ) + if not isinstance(window, dict): + return { + "ok": False, + "error": "Timed out waiting for matching window.", + "timeout_seconds": timeout_seconds, + "surface_kind": str(self.last_surface_state.get("surface_kind") or "unknown"), + "dialog_kind": str(self.last_surface_state.get("dialog_kind") or "none"), + "recommended_next_tools": ["get_active_window", "see_screen", "enhance"], + } + target = self._build_target_handle( + self._parse_int(window.get("hwnd"), default=0), + target_type="window", + class_name=str(window.get("class_name") or ""), + text=str(window.get("title") or ""), + ) + self._store_native_target("window", target) + result = self._build_native_result( + ok=True, + message="Matching window appeared.", + window=window, + target=target, + confidence=0.95, + native_channel="window", + ) + result["window"] = window + result["timeout_seconds"] = timeout_seconds + return result + + def _tool_wait_for_focus_change(self, args: dict[str, Any]) -> dict[str, Any]: + timeout_seconds = self._parse_seconds( + args.get("timeout_seconds"), + default=self.options.focus_timeout_seconds, + max_seconds=120.0, + ) + starting = self._get_active_window_info() + start_hwnd = self._parse_int(starting.get("hwnd"), default=0) + + def predicate() -> dict[str, Any] | None: + current = self._get_active_window_info() + current_hwnd = self._parse_int(current.get("hwnd"), default=0) + if current_hwnd == 0 or 
current_hwnd == start_hwnd: + return None + if args.get("title_contains") and not self._match_text_filter(current.get("title"), str(args.get("title_contains") or "")): + return None + if args.get("class_name") and str(current.get("class_name") or "").strip().lower() != str(args.get("class_name") or "").strip().lower(): + return None + if args.get("executable_name") and not self._match_text_filter(current.get("executable_name"), str(args.get("executable_name") or "")): + return None + return current + + changed = self._wait_for(timeout_seconds=timeout_seconds, predicate=predicate) + if not isinstance(changed, dict): + return { + "ok": False, + "error": "Timed out waiting for foreground focus change.", + "timeout_seconds": timeout_seconds, + "surface_kind": str(self.last_surface_state.get("surface_kind") or "unknown"), + "dialog_kind": str(self.last_surface_state.get("dialog_kind") or "none"), + "recommended_next_tools": ["get_active_window", "see_screen"], + } + target = self._build_target_handle( + self._parse_int(changed.get("hwnd"), default=0), + target_type="window", + class_name=str(changed.get("class_name") or ""), + text=str(changed.get("title") or ""), + ) + self._store_native_target("window", target) + self._emit( + "window_focused", + { + "hwnd": self._parse_int(changed.get("hwnd"), default=0), + "title": str(changed.get("title") or ""), + "class_name": str(changed.get("class_name") or ""), + "executable_name": str(changed.get("executable_name") or ""), + }, + ) + result = self._build_native_result( + ok=True, + message="Foreground focus changed.", + window=changed, + target=target, + confidence=0.95, + native_channel="window", + ) + result["window"] = changed + result["timeout_seconds"] = timeout_seconds + return result + + def _tool_detect_dialog(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native dialog detection is disabled for this run.", + ) 
+ dialog = self._find_dialog_info(title_contains=str(args.get("title_contains") or "")) + active = self._get_active_window_info() + if dialog is None: + result = self._build_native_result( + ok=True, + message="No native dialog detected.", + window=active, + confidence=0.82, + native_channel="dialog", + ) + result["dialog"] = None + return result + target = self._build_target_handle( + self._parse_int(dialog.get("hwnd"), default=0), + target_type="dialog", + class_name=str(dialog.get("class_name") or ""), + text=str(dialog.get("title") or ""), + ) + self._store_native_target("dialog", target) + result = self._build_native_result( + ok=True, + message="Dialog detected.", + window=dialog, + target=target, + confidence=0.97, + native_channel="dialog", + ) + result["dialog"] = dialog + result["buttons"] = [ + { + "text": str(element.get("text") or ""), + "target": element.get("target"), + } + for element in self._filter_ui_elements( + self._list_ui_elements_for_window(self._parse_int(dialog.get("hwnd"), default=0)), + role="button", + ) + ] + return result + + def _tool_dialog_action(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native dialog actions are disabled for this run.", + ) + action = str(args.get("action") or "").strip().lower() + dialog = self._find_dialog_info() + if self._parse_int(args.get("hwnd"), default=0): + dialog = self._window_info_from_handle(self._parse_int(args.get("hwnd"), default=0)) + if dialog is None or not bool(dialog.get("available")): + return {"ok": False, "error": "No dialog available for dialog_action."} + hwnd = self._parse_int(dialog.get("hwnd"), default=0) + button = self._find_dialog_button(hwnd, action) + invoked = False + if button is not None: + invoked = self._invoke_window_control(self._parse_int(button.get("handle"), default=0)) + if not invoked: + fallback_keys = { + "cancel": "esc", + "close": "esc", + "ok": 
"enter", + "open": "enter", + "save": "enter", + "yes": "alt+y", + "no": "alt+n", + } + key = fallback_keys.get(action, "") + if key: + key_result = self._tool_press_key({"key": key}) + invoked = bool(key_result.get("ok")) + if not invoked: + return { + **self._build_native_result( + ok=False, + message="Dialog action could not be invoked.", + window=dialog, + target=self._build_target_handle(hwnd, target_type="dialog"), + confidence=0.35, + native_channel="dialog", + ), + "error": f"Could not invoke dialog action '{action}'.", + } + result = self._build_native_result( + ok=True, + message="Dialog action invoked.", + window=dialog, + target=self._build_target_handle(hwnd, target_type="dialog"), + confidence=0.9, + native_channel="dialog", + ) + result["action"] = action + if button is not None: + result["button"] = {"text": str(button.get("text") or ""), "target": button.get("target")} + return result + + def _tool_dialog_set_filename(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native dialog filename entry is disabled for this run.", + ) + filename = str(args.get("filename") or "") + if not filename: + return {"ok": False, "error": "Missing filename."} + dialog = self._find_dialog_info() + if self._parse_int(args.get("hwnd"), default=0): + dialog = self._window_info_from_handle(self._parse_int(args.get("hwnd"), default=0)) + if dialog is None or not bool(dialog.get("available")): + return {"ok": False, "error": "No dialog available for dialog_set_filename."} + hwnd = self._parse_int(dialog.get("hwnd"), default=0) + edit_candidates = self._filter_ui_elements( + self._list_ui_elements_for_window(hwnd), + role="text_input", + ) + if not edit_candidates: + return { + **self._build_native_result( + ok=False, + message="Dialog filename field not found.", + window=dialog, + target=self._build_target_handle(hwnd, target_type="dialog"), + confidence=0.35, + 
native_channel="dialog", + ), + "error": "No visible edit control found in the dialog.", + } + edit = edit_candidates[0] + handle = self._parse_int(edit.get("handle"), default=0) + if not self._set_window_value(handle, filename): + return {"ok": False, "error": "Failed to set dialog filename field."} + result = self._build_native_result( + ok=True, + message="Dialog filename set.", + window=dialog, + target=edit.get("target") if isinstance(edit.get("target"), dict) else None, + confidence=0.93, + native_channel="dialog", + ) + result["filename"] = filename + result["element"] = edit + return result + + def _tool_wait_for_dialog_close(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native dialog waits are disabled for this run.", + ) + hwnd = self._parse_int(args.get("hwnd"), default=0) + timeout_seconds = self._parse_seconds( + args.get("timeout_seconds"), + default=self.options.dialog_timeout_seconds, + max_seconds=120.0, + ) + dialog = self._window_info_from_handle(hwnd) if hwnd else self._find_dialog_info() + dialog_hwnd = self._parse_int((dialog or {}).get("hwnd"), default=0) + closed = self._wait_for( + timeout_seconds=timeout_seconds, + predicate=lambda: self._get_active_window_info() + if dialog_hwnd and self._find_window_info(hwnd=dialog_hwnd, visible_only=False) is None + else None, + ) + if not isinstance(closed, dict): + return { + "ok": False, + "error": "Timed out waiting for dialog to close.", + "timeout_seconds": timeout_seconds, + "dialog_hwnd": dialog_hwnd, + "recommended_next_tools": ["detect_dialog", "see_screen", "enhance"], + } + result = self._build_native_result( + ok=True, + message="Dialog closed.", + window=closed, + confidence=0.94, + native_channel="dialog", + ) + result["closed_dialog_hwnd"] = dialog_hwnd + result["window"] = closed + return result + + def _resolve_ui_scope_window(self, args: dict[str, Any]) -> dict[str, Any] | 
None: + window_payload = args.get("window") if isinstance(args.get("window"), dict) else None + if window_payload and self._parse_int(window_payload.get("handle"), default=0): + return self._window_info_from_handle(self._parse_int(window_payload.get("handle"), default=0)) + scope = str(args.get("scope") or "active_window").strip().lower() + if scope == "dialog": + return self._find_dialog_info() + return self._get_active_window_info() + + def _tool_list_ui_elements(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native UI element listing is disabled for this run.", + ) + window = self._resolve_ui_scope_window(args) + if window is None or not bool(window.get("available")): + return {"ok": False, "error": "No target window available for list_ui_elements."} + elements = self._filter_ui_elements( + self._list_ui_elements_for_window( + self._parse_int(window.get("hwnd"), default=0), + include_hidden=bool(args.get("include_hidden", False)), + ), + text_contains=str(args.get("text_contains") or ""), + class_name=str(args.get("class_name") or ""), + role=str(args.get("role") or ""), + ) + if elements: + self._emit( + "ui_element_found", + { + "window_hwnd": self._parse_int(window.get("hwnd"), default=0), + "count": len(elements), + "role": str(args.get("role") or ""), + "text_contains": str(args.get("text_contains") or ""), + }, + ) + result = self._build_native_result( + ok=True, + message="UI elements listed.", + window=window, + confidence=0.9 if elements else 0.4, + native_channel="ui_element", + ) + result["window"] = window + result["elements"] = elements + result["count"] = len(elements) + return result + + def _tool_invoke_ui_element(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native UI element invocation is disabled for this run.", + ) + element = 
args.get("element") if isinstance(args.get("element"), dict) else {} + handle = self._resolve_target_handle(element) + if not handle: + return {"ok": False, "error": "Missing UI element handle."} + if not self._invoke_window_control(handle): + return {"ok": False, "error": "Failed to invoke native UI element."} + window = self._window_info_from_handle(self._parse_int(element.get("window_handle"), default=0)) if element else self._get_active_window_info() + result = self._build_native_result( + ok=True, + message="UI element invoked.", + window=window, + target=dict(element) if element else None, + confidence=0.88, + native_channel="ui_element", + ) + result["element"] = element + return result + + def _tool_set_ui_element_value(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native UI element value setting is disabled for this run.", + ) + element = args.get("element") if isinstance(args.get("element"), dict) else {} + handle = self._resolve_target_handle(element) + text = str(args.get("text") or "") + if not handle: + return {"ok": False, "error": "Missing UI element handle."} + if not self._set_window_value(handle, text): + return {"ok": False, "error": "Failed to set native UI element value."} + window = self._window_info_from_handle(self._parse_int(element.get("window_handle"), default=0)) if element else self._get_active_window_info() + result = self._build_native_result( + ok=True, + message="UI element value set.", + window=window, + target=dict(element) if element else None, + confidence=0.9, + native_channel="ui_element", + ) + result["element"] = element + result["text"] = text + return result + + def _tool_select_ui_element(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native UI element selection is disabled for this run.", + ) + 
self._require_windows() + user32 = ctypes.windll.user32 + element = args.get("element") if isinstance(args.get("element"), dict) else {} + handle = self._resolve_target_handle(element) + if not handle: + return {"ok": False, "error": "Missing UI element handle."} + class_name = str(element.get("class_name") or "").strip().lower() + text = str(args.get("text") or "") + index = self._parse_int(args.get("index"), default=-1) + changed = False + if class_name == "combobox": + cb_selectstring = 0x014D + cb_setcursel = 0x014E + if text: + changed = bool(user32.SendMessageW(handle, cb_selectstring, -1, text)) + elif index >= 0: + changed = bool(user32.SendMessageW(handle, cb_setcursel, index, 0) != -1) + elif class_name == "listbox": + lb_selectstring = 0x018C + lb_setcursel = 0x0186 + if text: + changed = bool(user32.SendMessageW(handle, lb_selectstring, -1, text)) + elif index >= 0: + changed = bool(user32.SendMessageW(handle, lb_setcursel, index, 0) != -1) + if not changed: + return { + "ok": False, + "error": "Selection failed or the control class is not supported for native selection.", + } + window = self._window_info_from_handle(self._parse_int(element.get("window_handle"), default=0)) if element else self._get_active_window_info() + result = self._build_native_result( + ok=True, + message="UI element selection applied.", + window=window, + target=dict(element) if element else None, + confidence=0.86, + native_channel="ui_element", + ) + result["element"] = element + if text: + result["text"] = text + if index >= 0: + result["index"] = index + return result + + def _tool_wait_for_ui_element(self, args: dict[str, Any]) -> dict[str, Any]: + if not self._native_control_tools_enabled(): + return self._native_control_unavailable_result( + message="Native UI element waits are disabled for this run.", + ) + timeout_seconds = self._parse_seconds( + args.get("timeout_seconds"), + default=self.options.ui_element_timeout_seconds, + max_seconds=120.0, + ) + + def predicate() 
-> dict[str, Any] | None: + window = self._resolve_ui_scope_window(args) + if window is None or not bool(window.get("available")): + return None + elements = self._filter_ui_elements( + self._list_ui_elements_for_window( + self._parse_int(window.get("hwnd"), default=0), + include_hidden=bool(args.get("include_hidden", False)), + ), + text_contains=str(args.get("text_contains") or ""), + class_name=str(args.get("class_name") or ""), + role=str(args.get("role") or ""), + ) + if not elements: + return None + return {"window": window, "element": elements[0], "count": len(elements)} + + matched = self._wait_for(timeout_seconds=timeout_seconds, predicate=predicate) + if not isinstance(matched, dict): + return { + "ok": False, + "error": "Timed out waiting for matching UI element.", + "timeout_seconds": timeout_seconds, + "recommended_next_tools": ["list_ui_elements", "detect_dialog", "see_screen", "enhance"], + } + self._emit( + "ui_element_found", + { + "window_hwnd": self._parse_int(((matched.get("window") or {}).get("hwnd")), default=0), + "count": self._parse_int(matched.get("count"), default=1), + "role": str(((matched.get("element") or {}).get("role")) or ""), + "text": str(((matched.get("element") or {}).get("text")) or ""), + }, + ) + result = self._build_native_result( + ok=True, + message="Matching UI element appeared.", + window=matched.get("window") if isinstance(matched.get("window"), dict) else None, + target=matched.get("element", {}).get("target") if isinstance(matched.get("element"), dict) else None, + confidence=0.92, + native_channel="ui_element", + ) + result["window"] = matched.get("window") + result["element"] = matched.get("element") + result["count"] = matched.get("count") + result["timeout_seconds"] = timeout_seconds + return result + + def _tool_get_active_window(self, _: dict[str, Any]) -> dict[str, Any]: + window = self._get_active_window_info() + target = None + if bool(window.get("available")): + target = self._build_target_handle( + 
self._parse_int(window.get("hwnd"), default=0), + target_type="window", + class_name=str(window.get("class_name") or ""), + text=str(window.get("title") or ""), + ) + self._store_native_target("window", target) + return { + **self._build_native_result( + ok=True, + message="Active window captured.", + window=window, + target=target, + confidence=0.98 if bool(window.get("available")) else 0.2, + native_channel="window", + ), + "window": window, + } + def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]: text = str(args.get("text", "")) for char in text: @@ -603,6 +3574,14 @@ class ScreenJobAgent: parts = [self._normalize_key_name(part) for part in combo.split("+") if part.strip()] return parts + def _normalize_prohibited_key_combos(self, combos: set[str] | list[str] | tuple[str, ...] | None) -> set[str]: + normalized: set[str] = set() + for combo in combos or []: + parts = self._parse_key_combo(str(combo)) + if parts: + normalized.add("+".join(parts)) + return normalized + def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]: key = str(args.get("key", "")).strip().lower() repeats = max(1, int(args.get("repeats", 1))) @@ -612,6 +3591,18 @@ class ScreenJobAgent: combo = self._parse_key_combo(key) if not combo: return {"ok": False, "error": "Invalid key."} + combo_text = "+".join(combo) + if combo_text in self.prohibited_key_combos: + return { + "ok": False, + "blocked": True, + "key": combo_text, + "error": f"Key combo '{combo_text}' is prohibited by runtime configuration.", + "hint": ( + "Use another allowed route such as menu navigation, clicking the visible control, " + "or a different non-prohibited shortcut." + ), + } executed = 0 for _ in range(repeats): @@ -623,7 +3614,6 @@ class ScreenJobAgent: pyautogui.hotkey(*combo) executed += 1 time.sleep(0.03) - combo_text = "+".join(combo) message = "Key combo executed." if len(combo) > 1 else "Key press executed." 
return {"ok": True, "key": combo_text, "repeats": executed, "message": message} @@ -638,13 +3628,55 @@ class ScreenJobAgent: elapsed += interval return {"ok": True, "slept_seconds": round(seconds, 3), "message": "Sleep completed."} + def _objective_allows_output_discovery(self) -> bool: + text = self.objective.lower() + file_terms = ("file", "folder", "directory", "path", "saved", "save", "output", "artifact") + discover_terms = ("find", "locate", "reveal", "where", "show", "open", "browse") + return any(term in text for term in file_terms) and any(term in text for term in discover_terms) + + def _is_unrequested_output_discovery_command(self, command: str) -> bool: + lowered = command.lower() + patterns = ( + r"\bget-childitem\b.*\s-recurse\b", + r"\bgci\b.*\s-recurse\b", + r"\bdir\b.*\s/s\b", + r"\bwhere\b\s+/r\b", + r"\brg\b.*\s--files\b", + r"\bfindstr\b.*\s/s\b", + r"\bselect-string\b.*\b-recurse\b", + r"\bexplorer(?:\.exe)?\b.*\s/select\b", + ) + return any(re.search(pattern, lowered) for pattern in patterns) + + def _is_background_launch_command(self, command: str) -> bool: + lowered = re.sub(r"\s+", " ", str(command or "").strip().lower()) + if not lowered: + return False + launch_patterns = ( + r"^(?:cmd(?:\.exe)? /c )?start(?:\.exe)?(?: +\"[^\"]*\")? +\S+", + r"\bstart-process\b", + r"\bexplorer(?:\.exe)? +https?://", + ) + return any(re.search(pattern, lowered) for pattern in launch_patterns) + def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]: command = str(args.get("command", "")).strip() if not command: return {"ok": False, "error": "Empty command."} + if self._is_unrequested_output_discovery_command(command) and not self._objective_allows_output_discovery(): + return { + "ok": False, + "blocked": True, + "command": command, + "error": ( + "Recursive file-search or reveal commands are out of scope unless the objective explicitly asks " + "to find, reveal, open, or locate a file. Verify the requested task and finish instead." 
+ ), + } started = time.time() process: subprocess.Popen[str] | None = None + background_launch_assumed = self._is_background_launch_command(command) try: process = subprocess.Popen( command, @@ -679,7 +3711,7 @@ class ScreenJobAgent: time.sleep(0.05) stdout, stderr = process.communicate(timeout=2) - return { + result = { "ok": True, "command": command, "exit_code": process.returncode, @@ -687,6 +3719,15 @@ class ScreenJobAgent: "stderr": (stderr or "")[-12000:], "elapsed_ms": int((time.time() - started) * 1000), } + if background_launch_assumed: + result["background_launch_assumed"] = True + result["focus_change_assumed"] = False + result["next_step_hint"] = ( + "Do not assume the launched app or URL is foreground. " + "Verify with get_active_window or see_screen before typing or retrying. " + "If get_active_window confirms the expected editor or dialog is focused, act directly instead of taking another screenshot first." + ) + return result except Exception as exc: # noqa: BLE001 if process is not None and process.poll() is None: try: @@ -711,10 +3752,41 @@ class ScreenJobAgent: def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]: if name in self.disabled_tools: return {"ok": False, "error": f"Tool '{name}' is disabled for this job."} + finish_likely_result = self._check_finish_likely_gate(name, args) + if finish_likely_result is not None: + return finish_likely_result + observation_loop_result = self._check_observation_loop_gate(name) + if observation_loop_result is not None: + return observation_loop_result + gate_result, policy = self._check_action_gate(name, args) + if gate_result is not None: + return gate_result handlers = { "see_screen": self._tool_see_screen, "enhance": self._tool_enhance, + "list_windows": self._tool_list_windows, + "find_window": self._tool_find_window, + "focus_window": self._tool_focus_window, + "close_window": self._tool_close_window, + "wait_for_window": self._tool_wait_for_window, + "wait_for_focus_change": 
self._tool_wait_for_focus_change, + "detect_dialog": self._tool_detect_dialog, + "dialog_action": self._tool_dialog_action, + "dialog_set_filename": self._tool_dialog_set_filename, + "wait_for_dialog_close": self._tool_wait_for_dialog_close, + "list_ui_elements": self._tool_list_ui_elements, + "invoke_ui_element": self._tool_invoke_ui_element, + "set_ui_element_value": self._tool_set_ui_element_value, + "select_ui_element": self._tool_select_ui_element, + "wait_for_ui_element": self._tool_wait_for_ui_element, "click": self._tool_click, + "scroll": self._tool_scroll, + "drag": self._tool_drag, + "move_mouse": self._tool_move_mouse, + "clipboard_get": self._tool_clipboard_get, + "clipboard_set": self._tool_clipboard_set, + "get_cursor_position": self._tool_get_cursor_position, + "get_active_window": self._tool_get_active_window, "type": self._tool_type, "press_key": self._tool_press_key, "sleep": self._tool_sleep, @@ -724,7 +3796,13 @@ class ScreenJobAgent: handler = handlers.get(name) if handler is None: return {"ok": False, "error": f"Unknown tool: {name}"} - return handler(args) + result = handler(args) + if name in OBSERVATION_TOOL_NAMES and bool(result.get("ok")): + self._mark_observation(name) + if policy is not None and bool(result.get("ok")): + self._note_action_attempt(policy) + result = self._decorate_verification_result(result, policy) + return self._update_finish_likely_from_tool(name, args, result) def _safe_parse_args(self, raw: str | None) -> dict[str, Any]: if not raw: @@ -760,6 +3838,22 @@ class ScreenJobAgent: y = clicked.get("y") if isinstance(x, int) and isinstance(y, int): summary = f"{summary} at=({x},{y})" + elif tool_name == "move_mouse": + moved_to = result.get("moved_to") if isinstance(result.get("moved_to"), dict) else {} + x = moved_to.get("x") + y = moved_to.get("y") + if isinstance(x, int) and isinstance(y, int): + summary = f"{summary} to=({x},{y})" + elif tool_name == "drag": + dragged_to = result.get("to") if 
isinstance(result.get("to"), dict) else {} + x = dragged_to.get("x") + y = dragged_to.get("y") + if isinstance(x, int) and isinstance(y, int): + summary = f"{summary} to=({x},{y})" + elif tool_name == "scroll": + amount = result.get("amount") + if isinstance(amount, int): + summary = f"{summary} amount={amount}" elif tool_name == "type": typed_length = int(result.get("typed_length", 0) or 0) summary = f"{summary} typed_length={typed_length}" @@ -767,6 +3861,37 @@ class ScreenJobAgent: key = str(result.get("key") or "").strip() if key: summary = f"{summary} key={key}" + elif tool_name == "clipboard_get": + summary = f"{summary} length={int(result.get('length', 0) or 0)}" + if bool(result.get("has_image")): + summary = f"{summary} has_image=true" + elif tool_name == "clipboard_set": + summary = f"{summary} length={int(result.get('length', 0) or 0)}" + elif tool_name == "get_cursor_position": + position = result.get("position") if isinstance(result.get("position"), dict) else {} + x = position.get("x") + y = position.get("y") + if isinstance(x, int) and isinstance(y, int): + summary = f"{summary} at=({x},{y})" + elif tool_name == "get_active_window": + window = result.get("window") if isinstance(result.get("window"), dict) else {} + title = str(window.get("title") or "").strip() + if title: + summary = f"{summary} title={title[:80]}" + elif tool_name == "list_windows": + summary = f"{summary} count={self._parse_int(result.get('count'), default=0)}" + elif tool_name in {"find_window", "focus_window", "wait_for_window", "wait_for_focus_change"}: + window = result.get("window") if isinstance(result.get("window"), dict) else {} + title = str(window.get("title") or "").strip() + if title: + summary = f"{summary} title={title[:80]}" + elif tool_name in {"detect_dialog", "dialog_action", "dialog_set_filename", "wait_for_dialog_close"}: + dialog = result.get("dialog") if isinstance(result.get("dialog"), dict) else {} + title = str(dialog.get("title") or 
result.get("dialog_kind") or "").strip() + if title: + summary = f"{summary} dialog={title[:80]}" + elif tool_name in {"list_ui_elements", "wait_for_ui_element"}: + summary = f"{summary} count={self._parse_int(result.get('count'), default=0)}" elif tool_name == "execute_command": exit_code = result.get("exit_code") if exit_code is not None: @@ -783,6 +3908,146 @@ class ScreenJobAgent: self.recent_tool_summaries.append(summary) self.recent_tool_summaries = self.recent_tool_summaries[-20:] + def _format_tool_call_log(self, tool_name: str, args: dict[str, Any]) -> str: + if tool_name == "execute_command": + command = str(args.get("command") or "").strip() + return f"{tool_name} command={command[:180]}" + if tool_name in {"find_window", "focus_window", "wait_for_window"}: + title = str(args.get("title_contains") or "").strip() + class_name = str(args.get("class_name") or "").strip() + return f"{tool_name} title={title[:80]} class={class_name[:80]}".strip() + if tool_name in {"dialog_action", "dialog_set_filename"}: + action = str(args.get("action") or args.get("filename") or "").strip() + return f"{tool_name} target={action[:120]}" + if tool_name in {"list_ui_elements", "wait_for_ui_element"}: + role = str(args.get("role") or "").strip() + text = str(args.get("text_contains") or "").strip() + return f"{tool_name} role={role[:40]} text={text[:80]}".strip() + if tool_name == "click": + coord = args.get("coordinate") if isinstance(args.get("coordinate"), dict) else {} + x = self._parse_int((coord or {}).get("x"), default=0) + y = self._parse_int((coord or {}).get("y"), default=0) + return f"{tool_name} at=({x},{y})" + if tool_name == "enhance": + coord = args.get("coordinate") if isinstance(args.get("coordinate"), dict) else {} + x = self._parse_int((coord or {}).get("x"), default=0) + y = self._parse_int((coord or {}).get("y"), default=0) + region = str(args.get("region") or "small").strip().lower() or "small" + mode = str(args.get("mode") or "ui").strip().lower() or 
"ui" + return f"{tool_name} at=({x},{y}) region={region} mode={mode}" + if tool_name == "type": + text = str(args.get("text") or "") + preview = text.replace("\n", "\\n") + return f"{tool_name} text={preview[:120]}" + if tool_name == "press_key": + key = str(args.get("key") or "").strip() + repeats = max(1, self._parse_int(args.get("repeats"), default=1)) + return f"{tool_name} key={key} repeats={repeats}" + return f"{tool_name} args={json.dumps(args, ensure_ascii=False)[:180]}" + + def _format_tool_result_log(self, tool_name: str, result: dict[str, Any]) -> str: + status = "blocked" if bool(result.get("blocked")) else ("ok" if bool(result.get("ok")) else "error") + parts = [f"{tool_name} {status}"] + if tool_name in {"see_screen", "enhance"}: + meta = result.get("meta") if isinstance(result.get("meta"), dict) else {} + path = str(meta.get("path") or result.get("path") or "").strip() + if tool_name == "see_screen": + width = self._parse_int(meta.get("width"), default=0) + height = self._parse_int(meta.get("height"), default=0) + if width > 0 and height > 0: + parts.append(f"size={width}x{height}") + if tool_name == "enhance": + source = meta.get("source_coord") if isinstance(meta.get("source_coord"), dict) else {} + parts.append( + f"at=({self._parse_int(source.get('x'), default=0)},{self._parse_int(source.get('y'), default=0)})" + ) + parts.append(f"region={str(meta.get('region') or 'small')}") + parts.append(f"mode={str(meta.get('mode') or 'ui')}") + if path: + parts.append(f"path={path}") + elif tool_name == "get_active_window": + window = result.get("window") if isinstance(result.get("window"), dict) else {} + title = str(window.get("title") or "").strip() + class_name = str(window.get("class_name") or "").strip() + if title: + parts.append(f"title={title[:120]}") + if class_name: + parts.append(f"class={class_name}") + elif tool_name in {"find_window", "focus_window", "wait_for_window", "wait_for_focus_change"}: + window = result.get("window") if 
isinstance(result.get("window"), dict) else {} + title = str(window.get("title") or "").strip() + if title: + parts.append(f"title={title[:120]}") + dialog_kind = str(result.get("dialog_kind") or "").strip() + if dialog_kind and dialog_kind != "none": + parts.append(f"dialog={dialog_kind}") + elif tool_name == "list_windows": + parts.append(f"count={self._parse_int(result.get('count'), default=0)}") + elif tool_name in {"detect_dialog", "dialog_action", "dialog_set_filename", "wait_for_dialog_close"}: + dialog_kind = str(result.get("dialog_kind") or "").strip() + if dialog_kind: + parts.append(f"dialog={dialog_kind}") + elif tool_name in {"list_ui_elements", "wait_for_ui_element"}: + parts.append(f"count={self._parse_int(result.get('count'), default=0)}") + elif tool_name == "type": + typed_length = self._parse_int(result.get("typed_length"), default=0) + if typed_length > 0: + parts.append(f"typed_length={typed_length}") + elif tool_name == "press_key": + key = str(result.get("key") or "").strip() + if key: + parts.append(f"key={key}") + elif tool_name == "click": + clicked = result.get("clicked") if isinstance(result.get("clicked"), dict) else {} + x = self._parse_int(clicked.get("x"), default=0) + y = self._parse_int(clicked.get("y"), default=0) + if x or y: + parts.append(f"clicked=({x},{y})") + elif tool_name == "execute_command": + exit_code = result.get("exit_code") + if exit_code is not None: + parts.append(f"exit_code={exit_code}") + if bool(result.get("background_launch_assumed")): + parts.append("background_launch_assumed=true") + verification_channels = result.get("verification_channels") + if isinstance(verification_channels, list) and verification_channels: + parts.append("verify=" + ",".join(str(item) for item in verification_channels)) + recommended_next_tools = result.get("recommended_next_tools") + if isinstance(recommended_next_tools, list) and recommended_next_tools: + parts.append("next=" + ",".join(str(item) for item in 
recommended_next_tools[:4])) + completion_evidence = result.get("completion_evidence") + if isinstance(completion_evidence, list) and completion_evidence: + summaries = [str(item.get("summary") or "").strip() for item in completion_evidence if isinstance(item, dict)] + summaries = [summary for summary in summaries if summary] + if summaries: + parts.append("evidence=" + " ; ".join(summaries[:2])) + finish_likely = result.get("finish_likely") + if isinstance(finish_likely, dict): + summary = str(finish_likely.get("summary") or "").strip() + if summary: + parts.append(f"finish_likely={summary[:180]}") + if bool(result.get("finish_likely_verification_done")): + parts.append("finish_verification_done=true") + error_text = str(result.get("error") or "").strip() + if error_text: + parts.append(f"error={error_text[:180]}") + return " | ".join(parts) + + def _log_tool_call(self, tool_name: str, args: dict[str, Any]) -> None: + self.logger.info("Tool call: %s", self._format_tool_call_log(tool_name, args)) + if self.options.pretty_logs: + self.logger.debug("Tool call detail (%s):\n%s", tool_name, json.dumps(args, ensure_ascii=False, indent=2)) + + def _log_tool_result(self, tool_name: str, result: dict[str, Any]) -> None: + log_level = logging.INFO if bool(result.get("ok")) else logging.WARNING + self.logger.log(log_level, "Tool result: %s", self._format_tool_result_log(tool_name, result)) + if self.options.pretty_logs: + self.logger.debug( + "Tool result detail (%s):\n%s", + tool_name, + json.dumps(result, ensure_ascii=False, indent=2), + ) + def _should_compact_context(self) -> bool: interval = max(0, int(self.options.screen_context_decay_steps or 0)) if interval <= 0: @@ -791,17 +4056,25 @@ class ScreenJobAgent: return False return (self.step - self.last_context_compact_step) >= interval - def _build_compacted_pending_input(self) -> list[dict[str, Any]]: - recent = self.recent_tool_summaries[-8:] - lines = "\n".join(f"- {line}" for line in recent) if recent else "- No 
recent tool activity." - content = ( - "Context compaction activated to decay stale screenshots and reduce token usage.\n" - f"JOB: {self.objective}\n" - f"Current step: {self.step}\n" - "Recent tool activity:\n" - f"{lines}\n" - "Continue execution from the latest screen state. " - "Use tools only, and finish with task_complete when done." + def _rebuild_reason(self) -> str | None: + if self.visual_context_overflow_pending: + return "visual_budget" + if self._should_compact_context(): + return "decay" + return None + + def _build_compacted_pending_input( + self, + rebuild_reason: str, + carryover_items: list[dict[str, Any]] | None = None, + ) -> list[dict[str, Any]]: + content = build_context_compaction_prompt( + self.objective, + self.step, + self.recent_tool_summaries, + rebuild_reason, + self._latest_visual_context_summaries(), + self._sorted_prohibited_key_combos(), ) compacted_input: list[dict[str, Any]] = [ { @@ -814,22 +4087,83 @@ class ScreenJobAgent: ], } ] - if self.last_screen_data_url and self.last_screen_meta: - compacted_input.append( - self._build_visual_message( - "Current screen after context compaction", - self.last_screen_data_url, - self.last_screen_meta, - ) - ) + carryover_user_items: list[dict[str, Any]] = [] + for item in carryover_items or []: + if str(item.get("type") or "") == "function_call_output": + continue + if self._is_visual_input_message(item): + continue + if str(item.get("role") or "") != "user": + continue + content_items = item.get("content") + if isinstance(content_items, list): + text_parts = [ + str(part.get("text") or "").strip() + for part in content_items + if isinstance(part, dict) and str(part.get("type") or "") == "input_text" + ] + joined_text = "\n".join(part for part in text_parts if part) + if joined_text.startswith("Context compaction activated due to"): + continue + carryover_user_items.append(item) + compacted_input.extend(carryover_user_items[-2:]) + compacted_input.extend( + [entry["message"] for entry in 
self._latest_visual_context_entries(self.visual_context_messages)] + ) return compacted_input + def _emit_context_compacted(self, rebuild_reason: str) -> None: + retained_paths = [ + str((entry.get("meta") or {}).get("path") or "") + for entry in self._latest_visual_context_entries(self.visual_context_messages) + ] + self.logger.info( + "Compacted model context at step %d due to %s. retained_visuals=%d", + self.step, + rebuild_reason, + len(retained_paths), + ) + if self.options.pretty_logs and retained_paths: + self.logger.debug( + "Retained visual context after compaction:\n%s", + json.dumps(retained_paths, ensure_ascii=False, indent=2), + ) + self._emit( + "context_compacted", + { + "step": self.step, + "decay_steps": self.options.screen_context_decay_steps, + "rebuild_reason": rebuild_reason, + "recent_tool_summaries": self.recent_tool_summaries[-8:], + "visual_context_count": len(self.visual_context_messages), + "visual_context_paths": retained_paths, + }, + ) + self.visual_context_overflow_pending = False + self.last_context_compact_step = self.step + def run(self, job: str) -> AgentResult: self.objective = job + self.completion_evidence = {} + self.last_observed_window = None + self.finish_likely_state.update( + { + "active": False, + "activated_at_step": 0, + "target_filename": self._infer_target_filename(job), + "summary": "", + "fresh_verification_done": False, + "verification_step": 0, + "post_completion_visual_signature": "", + } + ) started_at = time.time() self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model) self.logger.info("Job: %s", job) self.logger.info("Disabled tools: %s", sorted(self.disabled_tools)) + if self.prohibited_key_combos: + self.logger.info("Prohibited key combos: %s", self._sorted_prohibited_key_combos()) + self.logger.info("Pretty logs: %s", self.options.pretty_logs) self._emit( "job_started", { @@ -837,8 +4171,11 @@ class ScreenJobAgent: "model": self.options.model, "reasoning_effort": 
self.options.reasoning_effort, "screen_context_decay_steps": self.options.screen_context_decay_steps, + "max_visual_context_images": self._max_visual_context_images(), + "pretty_logs": self.options.pretty_logs, "objective": job, "disabled_tools": sorted(self.disabled_tools), + "prohibited_key_combos": self._sorted_prohibited_key_combos(), }, ) @@ -849,25 +4186,22 @@ class ScreenJobAgent: "content": [ { "type": "input_text", - "text": ( - f"JOB: {job}\n" - "You are in an action loop. Prefer execute_command for deterministic actions. " - "For modifier shortcuts, use a single press_key combo (example: win+r). " - "Before clicking tiny buttons/icons or dense UI areas, call enhance first " - "(use region='small'; use mode='text' for tiny text labels). " - "You can return multiple tool calls in one step (example: click then sleep). " - "When done call task_complete(return=..., data=...). " - "Before task_complete, verify the screen content is what was expected " - "using see_screen/enhance and include observed_result in data. " - "Include useful structured output in data." 
- ), + "text": build_initial_action_prompt(job, self._sorted_prohibited_key_combos()), } ], } ] if self.last_screen_data_url and self.last_screen_meta: - init_input.append( - self._build_visual_message("Initial screen capture", self.last_screen_data_url, self.last_screen_meta) + visual_message = self._build_visual_message( + "Initial screen capture", + self.last_screen_data_url, + self.last_screen_meta, + ) + init_input.append(visual_message) + self._register_visual_context_message( + visual_message, + self.last_screen_meta, + tool_name="see_screen", ) pending_input = init_input @@ -883,19 +4217,11 @@ class ScreenJobAgent: self.step += 1 self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps) self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps}) - if self._should_compact_context(): + rebuild_reason = self._rebuild_reason() + if rebuild_reason is not None: self.previous_response_id = None - pending_input = self._build_compacted_pending_input() - self.last_context_compact_step = self.step - self.logger.info("Compacted model context at step %d.", self.step) - self._emit( - "context_compacted", - { - "step": self.step, - "decay_steps": self.options.screen_context_decay_steps, - "recent_tool_summaries": self.recent_tool_summaries[-8:], - }, - ) + pending_input = self._build_compacted_pending_input(rebuild_reason, pending_input) + self._emit_context_compacted(rebuild_reason) try: response = self._call_model(pending_input) self._register_usage(response) @@ -921,16 +4247,7 @@ class ScreenJobAgent: "content": [ { "type": "input_text", - "text": ( - "No function call was returned. Continue by using tools. " - "Use one press_key call for key combos like win+r. " - "Prefer enhance before clicking small/unclear targets " - "(region='small', mode='ui' or 'text'). " - "You may call multiple tools in one step. 
" - "Before task_complete, verify expected screen content with see_screen/enhance " - "and include observed_result in data. " - "When complete, call task_complete(return=..., data=...)." - ), + "text": build_no_tool_prompt(self._sorted_prohibited_key_combos()), } ], } @@ -938,6 +4255,8 @@ class ScreenJobAgent: continue next_input: list[dict[str, Any]] = [] + step_tool_names: list[str] = [] + step_active_window: dict[str, Any] | None = None for tool_call in tool_calls: if self._is_cancelled(): cancelled = True @@ -949,7 +4268,8 @@ class ScreenJobAgent: args_raw = getattr(tool_call, "arguments", "{}") args = self._safe_parse_args(args_raw) - self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False)) + self._log_tool_call(name, args) + step_tool_names.append(name) self._emit( "tool_called", {"step": self.step, "tool": name, "args": args}, @@ -964,25 +4284,59 @@ class ScreenJobAgent: "traceback": traceback.format_exc()[-8000:], } - self.logger.debug( - "Tool result for %s: %s", - name, - json.dumps(result, ensure_ascii=False)[:2500], - ) + self._log_tool_result(name, result) self._record_tool_summary(name, result) self._emit("tool_result", {"step": self.step, "tool": name, "result": result}) + if name == "get_active_window" and bool(result.get("ok")) and isinstance(result.get("window"), dict): + step_active_window = dict(result["window"]) next_input.append( { "type": "function_call_output", "call_id": call_id, "output": json.dumps(result, ensure_ascii=False), } - ) + ) + + if bool(result.get("blocked")): + blocked_hint = str(result.get("hint") or "").strip() + blocked_prompt = build_blocked_action_prompt( + name, + blocked_hint, + self._sorted_prohibited_key_combos(), + ) + if str(result.get("blocked_reason") or "") == "observation_loop": + blocked_prompt = build_observation_loop_prompt( + str(result.get("window_summary") or "").strip() or None, + self._parse_int(result.get("repeated_steps"), default=MAX_STABLE_OBSERVATION_STEPS), + 
self._sorted_prohibited_key_combos(), + ) + if str(result.get("blocked_reason") or "") == "finish_likely": + blocked_prompt = build_finish_likely_prompt( + str(result.get("evidence_summary") or "").strip() or self._completion_summary(), + verification_done=bool(result.get("verification_done")), + prohibited_key_combos=self._sorted_prohibited_key_combos(), + ) + next_input.append( + { + "role": "user", + "content": [ + { + "type": "input_text", + "text": blocked_prompt, + } + ], + } + ) if name in ("see_screen", "enhance") and self.last_screen_data_url and self.last_screen_meta: title = "Updated screen capture" if name == "see_screen" else "Enhanced screen region" - next_input.append( - self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta) + visual_message = self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta) + next_input.append(visual_message) + self._register_visual_context_message( + visual_message, + self.last_screen_meta, + tool_name=name, + result=result, ) self._emit( "visual_update", @@ -995,6 +4349,23 @@ class ScreenJobAgent: if cancelled: break + self._record_step_history(step_tool_names, step_active_window) + if bool(self.finish_likely_state.get("active")): + next_input.append( + { + "role": "user", + "content": [ + { + "type": "input_text", + "text": build_finish_likely_prompt( + str(self.finish_likely_state.get("summary") or "").strip() or self._completion_summary(), + verification_done=bool(self.finish_likely_state.get("fresh_verification_done")), + prohibited_key_combos=self._sorted_prohibited_key_combos(), + ), + } + ], + } + ) pending_input = next_input ended_at = time.time() diff --git a/src/app_main.py b/src/app_main.py index 8ef6fa5..b061623 100644 --- a/src/app_main.py +++ b/src/app_main.py @@ -30,6 +30,7 @@ def main(argv: list[str] | None = None) -> int: print(" OPENAI_API_KEY=...") print(" SCREENJOB_TOKEN=...") print(" DISABLE_UI=true|false (optional)") + print(" 
SCREENJOB_PROHIBITED_KEY_COMBOS=ctrl+shift+s,alt+f4 (optional)") return 0 server.main() return 0 diff --git a/src/cli.py b/src/cli.py index ecccbe4..f299798 100644 --- a/src/cli.py +++ b/src/cli.py @@ -5,6 +5,7 @@ import json import sys from pathlib import Path +from .agent import normalize_disabled_tools from .config import load_app_config from .models import RuntimeOptions from .runtime import create_openai_client, run_job @@ -40,8 +41,55 @@ def build_parser() -> argparse.ArgumentParser: default=4, help="Compact model context every N steps to decay old screenshots (0 disables).", ) + parser.add_argument( + "--max-visual-context-images", + type=int, + default=3, + help="Maximum screenshots/enhanced images retained in model-visible context during rebases.", + ) + parser.add_argument( + "--native-automation-mode", + choices=["off", "prefer", "require_fallback"], + default="prefer", + help="How strongly the agent should prefer Windows-native automation helpers over pixel fallback.", + ) + parser.add_argument( + "--dialog-timeout-seconds", + type=float, + default=12.0, + help="Timeout for dialog-oriented waits and retries.", + ) + parser.add_argument( + "--focus-timeout-seconds", + type=float, + default=8.0, + help="Timeout for focus-change waits and verification.", + ) + parser.add_argument( + "--ui-element-timeout-seconds", + type=float, + default=8.0, + help="Timeout for native UI element lookup waits.", + ) + parser.add_argument( + "--max-retries-per-surface", + type=int, + default=3, + help="Maximum repeated retries on the same classified window/dialog surface before the agent must pivot.", + ) + parser.add_argument( + "--pretty-logs", + action="store_true", + help="Emit expanded multi-line tool call/result logs for easier debugging.", + ) parser.add_argument("--disable-tool", action="append", default=[], help="Disable a tool by name.") - parser.add_argument("--skip-safety-check", action="store_true", help="Bypass pre-flight safety check.") + parser.add_argument( 
+ "--skip-safety-check", + "--skip-safety-chec", + dest="skip_safety_check", + action="store_true", + help="Bypass pre-flight safety check.", + ) parser.add_argument("--no-failsafe", action="store_true", help="Disable PyAutoGUI fail-safe.") return parser @@ -57,7 +105,10 @@ def main(argv: list[str] | None = None) -> int: return 2 model = args.model or config.default_model - disabled_tools = sorted({str(x).strip() for x in args.disable_tool if str(x).strip()}) + try: + disabled_tools = normalize_disabled_tools(args.disable_tool) + except ValueError as exc: + parser.error(str(exc)) if not args.skip_safety_check: safety_client = create_openai_client(config.openai_api_key) @@ -92,7 +143,15 @@ def main(argv: list[str] | None = None) -> int: click_pause=args.click_pause, reasoning_effort=args.reasoning_effort, screen_context_decay_steps=max(0, int(args.screen_context_decay_steps)), + max_visual_context_images=max(0, int(args.max_visual_context_images)), + native_automation_mode=args.native_automation_mode, + dialog_timeout_seconds=max(0.5, float(args.dialog_timeout_seconds)), + focus_timeout_seconds=max(0.5, float(args.focus_timeout_seconds)), + ui_element_timeout_seconds=max(0.5, float(args.ui_element_timeout_seconds)), + max_retries_per_surface=max(1, int(args.max_retries_per_surface)), + pretty_logs=bool(args.pretty_logs), disable_tools=set(disabled_tools), + prohibited_key_combos=set(config.prohibited_key_combos), ) try: result, artifacts = run_job( diff --git a/src/config.py b/src/config.py index ed12c2e..306537f 100644 --- a/src/config.py +++ b/src/config.py @@ -14,6 +14,13 @@ def _env_bool(name: str, default: bool = False) -> bool: return raw.strip().lower() in {"1", "true", "yes", "on"} +def _env_csv(name: str) -> list[str]: + raw = os.getenv(name) + if raw is None: + return [] + return [item.strip() for item in raw.split(",") if item.strip()] + + @dataclass(frozen=True) class AppConfig: openai_api_key: str @@ -25,6 +32,7 @@ class AppConfig: port: int runs_dir: 
Path db_path: Path + prohibited_key_combos: tuple[str, ...] = () def load_app_config(cwd: Path) -> AppConfig: @@ -38,6 +46,7 @@ def load_app_config(cwd: Path) -> AppConfig: runs_dir = cwd / "screenjob_runs" db_path = cwd / "screenjob.db" disable_ui = _env_bool("DISABLE_UI", default=False) + prohibited_key_combos = tuple(_env_csv("SCREENJOB_PROHIBITED_KEY_COMBOS")) return AppConfig( openai_api_key=openai_api_key, screenjob_token=screenjob_token, @@ -48,5 +57,5 @@ def load_app_config(cwd: Path) -> AppConfig: port=port, runs_dir=runs_dir, db_path=db_path, + prohibited_key_combos=prohibited_key_combos, ) - diff --git a/src/models.py b/src/models.py index 22bdfc9..55e2fbb 100644 --- a/src/models.py +++ b/src/models.py @@ -60,4 +60,12 @@ class RuntimeOptions: click_pause: float = 0.10 reasoning_effort: str = "medium" screen_context_decay_steps: int = 4 + max_visual_context_images: int = 3 + native_automation_mode: str = "prefer" + dialog_timeout_seconds: float = 12.0 + focus_timeout_seconds: float = 8.0 + ui_element_timeout_seconds: float = 8.0 + max_retries_per_surface: int = 3 + pretty_logs: bool = False disable_tools: set[str] | None = None + prohibited_key_combos: set[str] | None = None diff --git a/src/server.py b/src/server.py index 9c01d67..f3b81ac 100644 --- a/src/server.py +++ b/src/server.py @@ -12,6 +12,7 @@ from fastapi.responses import FileResponse from fastapi.responses import HTMLResponse, JSONResponse from pydantic import BaseModel, Field +from .agent import normalize_disabled_tools from .config import AppConfig, load_app_config from .storage import HistoryDB from .task_manager import JobManager @@ -28,6 +29,13 @@ class CreateJobRequest(BaseModel): click_pause: float = Field(0.10, ge=0.0, le=2.0) reasoning_effort: str = Field("medium", pattern="^(low|medium|high)$") screen_context_decay_steps: int = Field(4, ge=0, le=50) + max_visual_context_images: int = Field(3, ge=0, le=12) + native_automation_mode: str = Field("prefer", 
pattern="^(off|prefer|require_fallback)$") + dialog_timeout_seconds: float = Field(12.0, ge=0.5, le=120.0) + focus_timeout_seconds: float = Field(8.0, ge=0.5, le=120.0) + ui_element_timeout_seconds: float = Field(8.0, ge=0.5, le=120.0) + max_retries_per_surface: int = Field(3, ge=1, le=10) + pretty_logs: bool = False disabled_tools: list[str] = Field(default_factory=list) safety_override: bool = False no_failsafe: bool = False @@ -297,19 +305,30 @@ def create_app(config: AppConfig | None = None) -> FastAPI: @app.post("/api/jobs") def create_job(payload: CreateJobRequest, _: None = Depends(require_token)) -> dict[str, str]: - job_id = manager.submit_job( - objective=payload.job, - model=payload.model, - max_steps=payload.max_steps, - command_timeout=payload.command_timeout, - type_interval=payload.type_interval, - click_pause=payload.click_pause, - reasoning_effort=payload.reasoning_effort, - screen_context_decay_steps=payload.screen_context_decay_steps, - disabled_tools=payload.disabled_tools, - safety_override=payload.safety_override, - no_failsafe=payload.no_failsafe, - ) + try: + disabled_tools = normalize_disabled_tools(payload.disabled_tools) + job_id = manager.submit_job( + objective=payload.job, + model=payload.model, + max_steps=payload.max_steps, + command_timeout=payload.command_timeout, + type_interval=payload.type_interval, + click_pause=payload.click_pause, + reasoning_effort=payload.reasoning_effort, + screen_context_decay_steps=payload.screen_context_decay_steps, + max_visual_context_images=payload.max_visual_context_images, + native_automation_mode=payload.native_automation_mode, + dialog_timeout_seconds=payload.dialog_timeout_seconds, + focus_timeout_seconds=payload.focus_timeout_seconds, + ui_element_timeout_seconds=payload.ui_element_timeout_seconds, + max_retries_per_surface=payload.max_retries_per_surface, + pretty_logs=payload.pretty_logs, + disabled_tools=disabled_tools, + safety_override=payload.safety_override, + 
no_failsafe=payload.no_failsafe, + ) + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc return {"job_id": job_id} @app.get("/api/jobs") diff --git a/src/task_manager.py b/src/task_manager.py index 7fc5c2e..92d0f34 100644 --- a/src/task_manager.py +++ b/src/task_manager.py @@ -8,7 +8,9 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Callable +from .agent import normalize_disabled_tools from .config import AppConfig +from .desktop_overlay import DesktopOverlayManager, get_desktop_overlay_manager from .models import RuntimeOptions from .runtime import create_openai_client, run_job from .safety import assess_task_safety @@ -32,10 +34,12 @@ class JobManager: config: AppConfig, db: HistoryDB, broadcast: Callable[[dict[str, Any]], None] | None = None, + overlay_manager: DesktopOverlayManager | None = None, ) -> None: self.config = config self.db = db self.broadcast = broadcast + self.overlay_manager = overlay_manager or get_desktop_overlay_manager() self._running: dict[str, _RunningJob] = {} self._lock = threading.Lock() @@ -50,6 +54,13 @@ class JobManager: click_pause: float = 0.10, reasoning_effort: str = "medium", screen_context_decay_steps: int = 4, + max_visual_context_images: int = 3, + native_automation_mode: str = "prefer", + dialog_timeout_seconds: float = 12.0, + focus_timeout_seconds: float = 8.0, + ui_element_timeout_seconds: float = 8.0, + max_retries_per_surface: int = 3, + pretty_logs: bool = False, disabled_tools: list[str] | None = None, safety_override: bool = False, no_failsafe: bool = False, @@ -57,7 +68,7 @@ class JobManager: job_id = f"job_{int(time.time())}_{uuid.uuid4().hex[:8]}" created_at = utc_now_iso() selected_model = (model or self.config.default_model).strip() or self.config.default_model - disabled = sorted({tool.strip() for tool in (disabled_tools or []) if tool.strip()}) + disabled = normalize_disabled_tools(disabled_tools) self.db.create_job( job_id=job_id, 
objective=objective, @@ -97,6 +108,13 @@ class JobManager: "click_pause": click_pause, "reasoning_effort": reasoning_effort, "screen_context_decay_steps": screen_context_decay_steps, + "max_visual_context_images": max_visual_context_images, + "native_automation_mode": native_automation_mode, + "dialog_timeout_seconds": dialog_timeout_seconds, + "focus_timeout_seconds": focus_timeout_seconds, + "ui_element_timeout_seconds": ui_element_timeout_seconds, + "max_retries_per_surface": max_retries_per_surface, + "pretty_logs": pretty_logs, "no_failsafe": no_failsafe, "cancel_event": cancel_event, }, @@ -127,6 +145,13 @@ class JobManager: click_pause: float, reasoning_effort: str, screen_context_decay_steps: int, + max_visual_context_images: int, + native_automation_mode: str, + dialog_timeout_seconds: float, + focus_timeout_seconds: float, + ui_element_timeout_seconds: float, + max_retries_per_surface: int, + pretty_logs: bool, no_failsafe: bool, cancel_event: threading.Event, ) -> None: @@ -226,7 +251,15 @@ class JobManager: click_pause=click_pause, reasoning_effort=reasoning_effort, screen_context_decay_steps=max(0, int(screen_context_decay_steps)), + max_visual_context_images=max(0, int(max_visual_context_images)), + native_automation_mode=str(native_automation_mode or "prefer").strip().lower() or "prefer", + dialog_timeout_seconds=max(0.5, float(dialog_timeout_seconds)), + focus_timeout_seconds=max(0.5, float(focus_timeout_seconds)), + ui_element_timeout_seconds=max(0.5, float(ui_element_timeout_seconds)), + max_retries_per_surface=max(1, int(max_retries_per_surface)), + pretty_logs=bool(pretty_logs), disable_tools=set(disabled_tools), + prohibited_key_combos=set(self.config.prohibited_key_combos), ) try: result, artifacts = run_job( @@ -297,6 +330,14 @@ class JobManager: }, }, ) + if status == "completed": + self.overlay_manager.show_completion( + job_id=job_id, + objective=objective, + return_message=result.return_message, + steps=result.steps, + 
elapsed_seconds=max(0.0, float(result.ended_at - result.started_at)), + ) with self._lock: self._running.pop(job_id, None) diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index d83beaa..fd8db08 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -1,8 +1,11 @@ from __future__ import annotations +import json import logging from pathlib import Path +from typing import Any +import pytest from PIL import Image import src.agent as agent_module @@ -15,8 +18,12 @@ class _DummyPyAutoGUI: def __init__(self) -> None: self.last_move_to: tuple[int, int] | None = None - self.last_click: tuple[int, int] | None = None + self.last_move_duration: float | None = None + self.last_click: dict[str, object] | None = None self.last_hotkey: tuple[str, ...] | None = None + self.last_drag_to: dict[str, object] | None = None + self.last_scroll: int | None = None + self.current_position: tuple[int, int] = (640, 360) def screenshot(self) -> Image.Image: return Image.new("RGB", (1280, 720), color=(24, 24, 24)) @@ -26,9 +33,26 @@ class _DummyPyAutoGUI: def moveTo(self, x: int, y: int, duration: float = 0.0) -> None: # noqa: N802 self.last_move_to = (x, y) + self.last_move_duration = duration + self.current_position = (x, y) - def click(self, x: int, y: int) -> None: - self.last_click = (x, y) + def click( + self, + x: int, + y: int, + clicks: int = 1, + interval: float = 0.0, + button: str = "left", + ) -> None: + self.last_click = {"x": x, "y": y, "clicks": clicks, "interval": interval, "button": button} + self.current_position = (x, y) + + def dragTo(self, x: int, y: int, duration: float = 0.0, button: str = "left") -> None: # noqa: N802 + self.last_drag_to = {"x": x, "y": y, "duration": duration, "button": button} + self.current_position = (x, y) + + def scroll(self, amount: int) -> None: + self.last_scroll = amount def write(self, _: str, interval: float = 0.0) -> None: return None @@ -39,6 +63,10 @@ class _DummyPyAutoGUI: def hotkey(self, *keys: str) 
-> None: self.last_hotkey = tuple(keys) + def position(self): + x, y = self.current_position + return type("Point", (), {"x": x, "y": y})() + def _build_agent(tmp_path: Path, monkeypatch) -> agent_module.ScreenJobAgent: dummy_gui = _DummyPyAutoGUI() @@ -84,11 +112,158 @@ def test_click_supports_directional_offsets(tmp_path: Path, monkeypatch) -> None "offset_up": "2px", "offset_right": 7, "offset": {"x": 3, "y": 4}, + "button": "right", + "click_count": 2, + "interval_seconds": "0.5s", + "duration_seconds": "0.2s", "sleep_after_seconds": 0, } ) assert click_result["ok"] is True assert click_result["clicked"] == {"x": 110, "y": 102} + assert click_result["button"] == "right" + assert click_result["click_count"] == 2 + assert click_result["interval_seconds"] == 0.5 + assert click_result["duration_seconds"] == 0.2 + assert agent_module.pyautogui.last_click == { + "x": 110, + "y": 102, + "clicks": 2, + "interval": 0.5, + "button": "right", + } + + +def test_scroll_supports_direction_and_amount(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + result = agent._tool_scroll( + { + "amount": 8, + "direction": "down", + "coordinate": {"x": 1400, "y": -5}, + "sleep_after_seconds": 0, + } + ) + + assert result["ok"] is True + assert result["amount"] == -8 + assert result["direction"] == "down" + assert result["moved_to"] == {"x": 1279, "y": 0} + assert agent_module.pyautogui.last_scroll == -8 + + +def test_drag_translates_coordinates_and_button(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + result = agent._tool_drag( + { + "start_coordinate": {"x": -10, "y": 100}, + "end_coordinate": {"x": 1285, "y": 800}, + "button": "middle", + "duration_seconds": "0.3s", + "sleep_after_seconds": 0, + } + ) + + assert result["ok"] is True + assert result["from"] == {"x": 0, "y": 100} + assert result["to"] == {"x": 1279, "y": 719} + assert result["button"] == "middle" + assert result["duration_seconds"] == 0.3 + assert 
agent_module.pyautogui.last_drag_to == { + "x": 1279, + "y": 719, + "duration": 0.3, + "button": "middle", + } + + +def test_move_mouse_clamps_target_coordinate(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + result = agent._tool_move_mouse({"coordinate": {"x": 1500, "y": -5}, "duration_seconds": "0.4s"}) + + assert result["ok"] is True + assert result["moved_to"] == {"x": 1279, "y": 0} + assert result["duration_seconds"] == 0.4 + assert agent_module.pyautogui.last_move_to == (1279, 0) + + +def test_clipboard_get_and_set_round_trip(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + state = {"text": ""} + monkeypatch.setattr(agent, "_clipboard_set_text", lambda text: state.__setitem__("text", text)) + monkeypatch.setattr(agent, "_clipboard_get_text", lambda: state["text"]) + monkeypatch.setattr( + agent, + "_clipboard_get_metadata", + lambda: {"has_text": bool(state["text"]), "has_image": True, "available_formats": ["CF_UNICODETEXT", "CF_DIB"]}, + ) + + set_result = agent._tool_clipboard_set({"text": "hello clipboard"}) + get_result = agent._tool_clipboard_get({}) + + assert set_result["ok"] is True + assert set_result["length"] == 15 + assert get_result["ok"] is True + assert get_result["text"] == "hello clipboard" + assert get_result["length"] == 15 + assert get_result["has_text"] is True + assert get_result["has_image"] is True + assert get_result["available_formats"] == ["CF_UNICODETEXT", "CF_DIB"] + + +def test_clipboard_set_falls_back_to_powershell_when_native_path_fails(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + state = {"text": ""} + + def fail_native(_: str) -> None: + raise OSError("[WinError 6] The handle is invalid.") + + def shell_fallback(text: str) -> None: + state["text"] = text + + monkeypatch.setattr(agent, "_clipboard_set_text", fail_native) + monkeypatch.setattr(agent, "_clipboard_set_text_via_shell", shell_fallback) + + result = 
agent._tool_clipboard_set({"text": "Example Domain"}) + + assert result["ok"] is True + assert result["used_shell_fallback"] is True + assert "WinError 6" in result["native_error"] + assert state["text"] == "Example Domain" + + +def test_get_cursor_position_returns_current_mouse_location(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent_module.pyautogui.current_position = (321, 654) + + result = agent._tool_get_cursor_position({}) + + assert result["ok"] is True + assert result["position"] == {"x": 321, "y": 654} + + +def test_get_active_window_returns_metadata_shape(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + monkeypatch.setattr( + agent, + "_get_active_window_info", + lambda: { + "available": True, + "hwnd": 1234, + "title": "Settings", + "class_name": "ApplicationFrameWindow", + "thread_id": 44, + "process_id": 77, + "is_visible": True, + "rect": {"left": 10, "top": 20, "right": 410, "bottom": 320, "width": 400, "height": 300}, + }, + ) + + result = agent._tool_get_active_window({}) + + assert result["ok"] is True + assert result["window"]["title"] == "Settings" + assert result["window"]["rect"]["width"] == 400 def test_enhance_defaults_to_small_ui_preset(tmp_path: Path, monkeypatch) -> None: @@ -135,6 +310,32 @@ def test_press_key_supports_hotkey_combo(tmp_path: Path, monkeypatch) -> None: assert agent_module.pyautogui.last_hotkey == ("win", "r") +def test_press_key_blocks_prohibited_combo(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.options.prohibited_key_combos = {"ctrl+shift+s"} + agent.prohibited_key_combos = agent._normalize_prohibited_key_combos(agent.options.prohibited_key_combos) + + result = agent._tool_press_key({"key": "ctrl+shift+s"}) + + assert result["ok"] is False + assert result["blocked"] is True + assert result["key"] == "ctrl+shift+s" + assert "prohibited by runtime configuration" in result["error"] + assert 
"another allowed route" in result["hint"] + + +def test_press_key_blocks_prohibited_combo_after_alias_normalization(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.options.prohibited_key_combos = {"meta+r"} + agent.prohibited_key_combos = agent._normalize_prohibited_key_combos(agent.options.prohibited_key_combos) + + result = agent._tool_press_key({"key": "win+r"}) + + assert result["ok"] is False + assert result["blocked"] is True + assert result["key"] == "win+r" + + def test_context_compaction_trigger_and_payload(tmp_path: Path, monkeypatch) -> None: agent = _build_agent(tmp_path, monkeypatch) agent.objective = "Open settings app" @@ -147,7 +348,596 @@ def test_context_compaction_trigger_and_payload(tmp_path: Path, monkeypatch) -> agent.last_screen_meta = {"width": 1280, "height": 720, "path": "C:/tmp/frame.png"} assert agent._should_compact_context() is True - compacted = agent._build_compacted_pending_input() + visual_message = agent._build_visual_message("Current screen", "data:image/png;base64,abc", agent.last_screen_meta) + agent._register_visual_context_message(visual_message, agent.last_screen_meta, tool_name="see_screen") + compacted = agent._build_compacted_pending_input("decay") assert len(compacted) == 2 - assert "Context compaction activated" in compacted[0]["content"][0]["text"] + assert "Context compaction activated due to stale context decay." 
in compacted[0]["content"][0]["text"] assert "Open settings app" in compacted[0]["content"][0]["text"] + assert "Treat prior reasoning as stale" in compacted[0]["content"][0]["text"] + assert "Retained visual observations:" in compacted[0]["content"][0]["text"] + assert "do not call see_screen again only because compaction happened" in compacted[0]["content"][0]["text"] + assert "observe -> decide -> act -> verify" in compacted[0]["content"][0]["text"] + + +def test_context_compaction_drops_function_call_outputs_from_rebased_input(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.objective = "Open settings app" + visual_meta = {"path": "C:/tmp/frame.png"} + visual_message = agent._build_visual_message("Current screen", "data:image/png;base64,abc", visual_meta) + agent._register_visual_context_message(visual_message, visual_meta, tool_name="see_screen") + + compacted = agent._build_compacted_pending_input( + "decay", + carryover_items=[ + {"type": "function_call_output", "call_id": "call_123", "output": "{\"ok\": true}"}, + {"role": "user", "content": [{"type": "input_text", "text": "blocked hint"}]}, + ], + ) + + assert len(compacted) == 3 + assert compacted[1]["role"] == "user" + assert compacted[1]["content"][0]["text"] == "blocked hint" + assert all(item.get("type") != "function_call_output" for item in compacted) + + +def test_visual_context_budget_keeps_only_latest_three_images(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.options.max_visual_context_images = 3 + + captured_times = [ + "2026-05-30T10:00:03+00:00", + "2026-05-30T10:00:01+00:00", + "2026-05-30T10:00:04+00:00", + "2026-05-30T10:00:02+00:00", + ] + for idx, captured_at in enumerate(captured_times): + meta = {"path": f"C:/tmp/frame_{idx}.png", "captured_at": captured_at} + message = agent._build_visual_message(f"frame {idx}", f"data:image/png;base64,{idx}", meta) + agent._register_visual_context_message(message, 
meta, tool_name="see_screen") + + assert agent.visual_context_overflow_pending is True + assert [entry["meta"]["path"] for entry in agent.visual_context_messages] == [ + "C:/tmp/frame_3.png", + "C:/tmp/frame_0.png", + "C:/tmp/frame_2.png", + ] + + +def test_compacted_input_uses_latest_visuals_by_capture_time(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.options.max_visual_context_images = 3 + agent.objective = "Verify the current app window" + + for idx, captured_at in enumerate( + [ + "2026-05-30T10:00:04+00:00", + "2026-05-30T10:00:01+00:00", + "2026-05-30T10:00:03+00:00", + "2026-05-30T10:00:02+00:00", + ] + ): + meta = {"path": f"C:/tmp/frame_{idx}.png", "captured_at": captured_at} + message = agent._build_visual_message(f"frame {idx}", f"data:image/png;base64,{idx}", meta) + agent._register_visual_context_message(message, meta, tool_name="see_screen") + + compacted = agent._build_compacted_pending_input("visual_budget") + visual_messages = [ + item + for item in compacted + if isinstance(item.get("content"), list) + and any(part.get("type") == "input_image" for part in item["content"] if isinstance(part, dict)) + ] + + assert len(visual_messages) == 3 + assert [ + json.loads(message["content"][0]["text"].split("Metadata: ", 1)[1].split("\n", 1)[0])["path"] + for message in visual_messages + ] == [ + "C:/tmp/frame_3.png", + "C:/tmp/frame_2.png", + "C:/tmp/frame_0.png", + ] + + +def test_context_compaction_event_includes_visual_budget_reason_and_paths(tmp_path: Path, monkeypatch) -> None: + events: list[dict[str, object]] = [] + agent = _build_agent(tmp_path, monkeypatch) + agent.event_callback = events.append + agent.step = 5 + agent.recent_tool_summaries = ["step=4 tool=enhance status=ok"] + agent.visual_context_messages = [ + {"message": {"role": "user", "content": []}, "meta": {"path": "C:/tmp/1.png"}}, + {"message": {"role": "user", "content": []}, "meta": {"path": "C:/tmp/2.png"}}, + {"message": {"role": 
"user", "content": []}, "meta": {"path": "C:/tmp/3.png"}}, + ] + + agent._emit_context_compacted("visual_budget") + + assert events[-1]["event_type"] == "context_compacted" + payload = events[-1]["payload"] + assert payload["rebuild_reason"] == "visual_budget" + assert payload["visual_context_paths"] == ["C:/tmp/1.png", "C:/tmp/2.png", "C:/tmp/3.png"] + + +def test_observation_loop_blocks_repeated_broad_reobservation(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.step_history = [ + { + "step": 21, + "tool_names": ["get_active_window", "see_screen"], + "window_signature": "123|#32770|Save as", + "window_summary": "Save as [#32770]", + "had_visual": True, + }, + { + "step": 22, + "tool_names": ["get_active_window", "see_screen"], + "window_signature": "123|#32770|Save as", + "window_summary": "Save as [#32770]", + "had_visual": True, + }, + { + "step": 23, + "tool_names": ["get_active_window", "see_screen"], + "window_signature": "123|#32770|Save as", + "window_summary": "Save as [#32770]", + "had_visual": True, + }, + ] + + blocked = agent._dispatch_tool("see_screen", {}) + + assert blocked["ok"] is False + assert blocked["blocked"] is True + assert blocked["blocked_reason"] == "observation_loop" + assert "unchanged foreground window" in blocked["error"] + assert blocked["window_summary"] == "Save as [#32770]" + + +def test_repeated_ambiguous_action_requires_verification_and_then_blocks(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + type_args = {"text": "repeat me"} + + first = agent._dispatch_tool("type", type_args) + assert first["ok"] is True + assert first["verification_required"] is True + assert first["verification_channels"] == ["enhance", "get_active_window", "see_screen"] + + blocked_without_verification = agent._dispatch_tool("type", type_args) + assert blocked_without_verification["blocked"] is True + assert "see_screen" in blocked_without_verification["error"] + + assert 
agent._dispatch_tool("see_screen", {})["ok"] is True + assert agent._dispatch_tool("type", type_args)["ok"] is True + assert agent._dispatch_tool("see_screen", {})["ok"] is True + assert agent._dispatch_tool("type", type_args)["ok"] is True + assert agent._dispatch_tool("see_screen", {})["ok"] is True + + blocked_after_retry_budget = agent._dispatch_tool("type", type_args) + assert blocked_after_retry_budget["blocked"] is True + assert "3 time(s) on the same surface" in blocked_after_retry_budget["error"] + + assert agent._dispatch_tool("see_screen", {})["ok"] is True + reset_attempt = agent._dispatch_tool("type", type_args) + assert reset_attempt["ok"] is True + + +def test_copy_shortcut_prefers_clipboard_verification(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + monkeypatch.setattr( + agent, + "_clipboard_get_metadata", + lambda: {"has_text": True, "has_image": False, "available_formats": ["CF_UNICODETEXT"]}, + ) + monkeypatch.setattr(agent, "_clipboard_get_text", lambda: "copied") + + first = agent._dispatch_tool("press_key", {"key": "ctrl+c"}) + assert first["ok"] is True + assert first["verification_channels"] == ["clipboard_get"] + + blocked = agent._dispatch_tool("press_key", {"key": "ctrl+c"}) + assert blocked["blocked"] is True + assert "clipboard_get" in blocked["error"] + + observed = agent._dispatch_tool("clipboard_get", {}) + assert observed["ok"] is True + assert observed["has_text"] is True + + second = agent._dispatch_tool("press_key", {"key": "ctrl+c"}) + assert second["ok"] is True + + +def test_execute_command_blocks_unrequested_recursive_file_search(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.objective = "Save the current note in Notepad" + + result = agent._tool_execute_command({"command": "Get-ChildItem -Recurse -Filter *.txt"}) + + assert result["ok"] is False + assert result["blocked"] is True + assert "out of scope" in result["error"] + + +def 
test_execute_command_allows_recursive_file_search_when_objective_requests_it(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.objective = "Find the saved text file path" + + called: dict[str, Any] = {} + + class _FakeProcess: + returncode = 0 + + def poll(self) -> int: + return 0 + + def communicate(self, timeout: int = 2): + return ("ok", "") + + def fake_popen(*args, **kwargs): + called["command"] = args[0] + return _FakeProcess() + + monkeypatch.setattr(agent_module.subprocess, "Popen", fake_popen) + + result = agent._tool_execute_command({"command": "Get-ChildItem -Recurse -Filter *.txt"}) + + assert result["ok"] is True + assert called["command"] == "Get-ChildItem -Recurse -Filter *.txt" + + +def test_execute_command_launch_requires_focus_verification(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + called: dict[str, Any] = {} + + class _FakeProcess: + returncode = 0 + + def poll(self) -> int: + return 0 + + def communicate(self, timeout: int = 2): + return ("", "") + + def fake_popen(*args, **kwargs): + called["command"] = args[0] + return _FakeProcess() + + monkeypatch.setattr(agent_module.subprocess, "Popen", fake_popen) + + first = agent._dispatch_tool("execute_command", {"command": "start notepad"}) + + assert first["ok"] is True + assert first["background_launch_assumed"] is True + assert first["focus_change_assumed"] is False + assert first["verification_required"] is True + assert first["verification_channels"] == ["get_active_window", "see_screen"] + assert called["command"] == "start notepad" + + blocked = agent._dispatch_tool("execute_command", {"command": "start notepad"}) + assert blocked["blocked"] is True + assert "get_active_window" in blocked["error"] + + observed = agent._dispatch_tool("get_active_window", {}) + assert observed["ok"] is True + + second = agent._dispatch_tool("execute_command", {"command": "start notepad"}) + assert second["ok"] is True + + +def 
test_system_prompt_emphasizes_situational_awareness() -> None: + prompt = agent_module.SYSTEM_PROMPT + + assert "Maintain a live mental model" in prompt + assert "classify -> choose control channel -> execute one meaningful transition -> verify" in prompt + assert "First classify, then act." in prompt + assert "Use see_screen at a balanced cadence" in prompt + assert "get_active_window" in prompt + assert "detect_dialog" in prompt + assert "dialog_set_filename" in prompt + assert "list_ui_elements" in prompt + assert "clipboard_get" in prompt + assert "Do not invent new subgoals" in prompt + assert "verify-and-finish" in prompt + assert "data.observed_result" in prompt + assert "Treat command-launched apps or URLs as background" in prompt + assert "#32770" in prompt + assert "secure desktop" in prompt.lower() + + +def test_observation_loop_prompt_pushes_action_or_finish() -> None: + prompt = agent_module.build_observation_loop_prompt("Save as [#32770]", repeated_steps=3) + + assert "same stable window for 3 step(s)" in prompt + assert "Save as [#32770]" in prompt + assert "Do not keep calling broad observation tools" in prompt + assert "native window/dialog/element tool" in prompt + assert "Use enhance only if a small or text-heavy control must be read before acting." in prompt + assert "#32770 dialog" in prompt + + +def test_finish_likely_prompt_pushes_verification_then_completion() -> None: + prompt = agent_module.build_finish_likely_prompt( + 'Save dialog closed and focus returned to "todo-demo.txt - Notepad". | Command verification confirms "todo-demo.txt" exists.', + prohibited_key_combos={"ctrl+shift+s"}, + ) + + assert "objective is likely already satisfied" in prompt + assert "todo-demo.txt - Notepad" in prompt + assert "call see_screen" in prompt + assert "then call task_complete" in prompt + assert "Do not reopen menus" in prompt + assert "Prohibited key combos for this run: ctrl+shift+s." 
in prompt + + +def test_initial_action_prompt_reinforces_observation_and_verification() -> None: + prompt = agent_module.build_initial_action_prompt("Open calculator", {"ctrl+shift+s"}) + + assert "JOB: Open calculator" in prompt + assert "First classify the current UI state from the latest evidence." in prompt + assert "Identify what changed since the last action or screen capture." in prompt + assert "classify -> choose control channel -> execute one meaningful transition -> verify" in prompt + assert "Prefer native window/dialog/element tools" in prompt + assert "get_active_window plus detect_dialog" in prompt + assert "click then see_screen" in prompt + assert "Do not invent new subgoals" in prompt + assert "Prefer non-visual verification when available" in prompt + assert "wait_for_focus_change" in prompt + assert "#32770 dialogs" in prompt + assert "Prohibited key combos for this run: ctrl+shift+s." in prompt + assert "do not re-capture the screen just to reconfirm an obvious large input area" in prompt + assert 'task_complete(return=..., data={"observed_result": ...})' in prompt + + +def test_no_tool_prompt_recovers_by_reobserving() -> None: + prompt = agent_module.build_no_tool_prompt({"ctrl+shift+s"}) + + assert "Recover by re-observing the current desktop state instead of guessing." in prompt + assert "Start by classifying the surface." in prompt + assert "get_active_window" in prompt + assert "detect_dialog" in prompt + assert "clipboard_get" in prompt + assert "native window/dialog/element tools" in prompt + assert "Do not assume execute_command launches changed the foreground window" in prompt + assert "Prohibited key combos for this run: ctrl+shift+s." 
in prompt + assert "If a modal, picker, or browser download/upload surface is likely" in prompt + + +def test_blocked_action_prompt_reanchors_on_screen_state() -> None: + prompt = agent_module.build_blocked_action_prompt("click", prohibited_key_combos={"ctrl+shift+s"}) + + assert "The last action using click was blocked or unreliable." in prompt + assert "Do not retry blindly." in prompt + assert "classify the current surface" in prompt + assert "detect_dialog" in prompt + assert "dialog_set_filename" in prompt + assert "get_active_window" in prompt + assert "get_cursor_position before move_mouse or drag" in prompt + assert "wait_for_focus_change" in prompt + assert "secure desktop or UAC" in prompt + assert "Switch strategy after the fresh classification" in prompt + assert "Prohibited key combos for this run: ctrl+shift+s." in prompt + assert "native control instead of pixels" in prompt + + +def test_tool_schemas_include_completion_and_desktop_awareness_guidance(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.prohibited_key_combos = {"ctrl+shift+s"} + schemas = {tool["name"]: tool for tool in agent._tool_schemas()} + + assert "data.observed_result" in schemas["task_complete"]["description"] + assert "before task_complete" in schemas["see_screen"]["description"] + assert "text-heavy targets" in schemas["enhance"]["description"] + assert "verify copy or cut results" in schemas["clipboard_get"]["description"] + assert "pointer state matters" in schemas["get_cursor_position"]["description"] + assert "verify focus and active app" in schemas["get_active_window"]["description"] + assert "foreground focus" in schemas["execute_command"]["description"] + assert "Prohibited for this run: ctrl+shift+s." 
in schemas["press_key"]["description"] + assert "dialog classification" in schemas["get_active_window"]["description"] + assert "visible top-level windows" in schemas["list_windows"]["description"] + assert "#32770 or picker surface" in schemas["detect_dialog"]["description"] + assert "filename or path field" in schemas["dialog_set_filename"]["description"] + assert "native child controls" in schemas["list_ui_elements"]["description"] + + +def test_tool_schemas_hide_optional_native_tools_when_mode_off(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.options.native_automation_mode = "off" + + schemas = {tool["name"]: tool for tool in agent._tool_schemas()} + + assert "get_active_window" in schemas + assert "list_windows" not in schemas + assert "detect_dialog" not in schemas + assert "list_ui_elements" not in schemas + + +def test_list_windows_returns_structured_surface_metadata(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + monkeypatch.setattr( + agent, + "_list_windows_info", + lambda visible_only=True: [ + { + "available": True, + "hwnd": 111, + "title": "Open", + "class_name": "#32770", + "executable_name": "notepad.exe", + "surface_kind": "file_dialog", + "dialog_kind": "file_open", + } + ], + ) + monkeypatch.setattr( + agent, + "_get_active_window_info", + lambda: { + "available": True, + "hwnd": 111, + "title": "Open", + "class_name": "#32770", + "executable_name": "notepad.exe", + }, + ) + + result = agent._tool_list_windows({}) + + assert result["ok"] is True + assert result["count"] == 1 + assert result["surface_kind"] == "file_dialog" + assert result["dialog_kind"] == "file_open" + assert result["recommended_next_tools"][0] == "dialog_set_filename" + + +def test_detect_dialog_returns_buttons_and_target_handle(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + monkeypatch.setattr( + agent, + "_find_dialog_info", + lambda title_contains="": { 
+ "available": True, + "hwnd": 222, + "title": "Save as", + "class_name": "#32770", + "executable_name": "notepad.exe", + }, + ) + monkeypatch.setattr( + agent, + "_get_active_window_info", + lambda: { + "available": True, + "hwnd": 222, + "title": "Save as", + "class_name": "#32770", + "executable_name": "notepad.exe", + }, + ) + monkeypatch.setattr( + agent, + "_list_ui_elements_for_window", + lambda hwnd, include_hidden=False: [ + { + "handle": 10, + "role": "button", + "text": "Save", + "target": {"type": "ui_element", "handle": 10, "window_handle": hwnd}, + } + ], + ) + + result = agent._tool_detect_dialog({}) + + assert result["ok"] is True + assert result["dialog_kind"] == "file_save" + assert result["target"]["type"] == "dialog" + assert result["buttons"][0]["text"] == "Save" + + +def test_notepad_save_pattern_enters_finish_likely_mode(tmp_path: Path, monkeypatch) -> None: + events: list[dict[str, object]] = [] + agent = _build_agent(tmp_path, monkeypatch) + agent.event_callback = events.append + agent.objective = "Open Notepad, type a short to-do list, save it as todo-demo.txt in Documents" + agent.finish_likely_state["target_filename"] = agent._infer_target_filename(agent.objective) + agent.last_observed_window = { + "available": True, + "title": "Save as", + "class_name": "#32770", + } + + agent.step = 24 + window_result = agent._update_finish_likely_from_tool( + "get_active_window", + {}, + { + "ok": True, + "window": { + "available": True, + "title": "todo-demo.txt - Notepad", + "class_name": "Notepad", + }, + }, + ) + + assert agent.finish_likely_state["active"] is False + assert [item["kind"] for item in window_result["completion_evidence"]] == [ + "active_window_title_matches_target", + "save_dialog_closed_to_target_window", + ] + + agent.last_visual_signature = "stable-post-save" + agent.step = 25 + command_result = agent._update_finish_likely_from_tool( + "execute_command", + {"command": "powershell -NoProfile -Command \"Test-Path ... 
todo-demo.txt\""}, + { + "ok": True, + "exit_code": 0, + "stdout": r"C:\Users\paulw\Documents\todo-demo.txt", + }, + ) + + assert agent.finish_likely_state["active"] is True + assert agent.finish_likely_state["summary"] + assert command_result["finish_likely"]["target_filename"] == "todo-demo.txt" + assert any(event["event_type"] == "completion_evidence" for event in events) + assert any(event["event_type"] == "finish_likely" for event in events) + + +def test_finish_likely_guard_blocks_reopening_menu_after_fresh_verification(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.objective = "Open Notepad, type a short to-do list, save it as todo-demo.txt in Documents" + agent.finish_likely_state.update( + { + "active": True, + "activated_at_step": 24, + "target_filename": "todo-demo.txt", + "summary": 'Save dialog closed and focus returned to "todo-demo.txt - Notepad". | Command verification confirms "todo-demo.txt" exists.', + "fresh_verification_done": False, + "verification_step": 0, + "post_completion_visual_signature": "", + } + ) + + agent.step = 25 + verify_result = agent._dispatch_tool("see_screen", {}) + assert verify_result["ok"] is True + assert verify_result["finish_likely_verification_done"] is True + assert agent.finish_likely_state["fresh_verification_done"] is True + + blocked = agent._dispatch_tool("press_key", {"key": "alt+f"}) + assert blocked["ok"] is False + assert blocked["blocked"] is True + assert blocked["blocked_reason"] == "finish_likely" + assert "appears satisfied" in blocked["error"] + assert "reopen menus" in blocked["hint"].lower() + + +def test_dispatch_rejects_unknown_and_disabled_tools(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.disabled_tools = {"scroll"} + + assert agent._dispatch_tool("unknown_tool", {}) == {"ok": False, "error": "Unknown tool: unknown_tool"} + assert agent._dispatch_tool("scroll", {}) == {"ok": False, "error": "Tool 'scroll' 
is disabled for this job."} + + +def test_tool_schemas_filter_disabled_tools(tmp_path: Path, monkeypatch) -> None: + agent = _build_agent(tmp_path, monkeypatch) + agent.disabled_tools = {"scroll", "clipboard_get"} + + tool_names = {tool["name"] for tool in agent._tool_schemas()} + + assert "scroll" not in tool_names + assert "clipboard_get" not in tool_names + assert "click" in tool_names + assert "task_complete" in tool_names + + +def test_normalize_disabled_tools_rejects_invalid_and_required_names() -> None: + with pytest.raises(ValueError, match="Unknown disabled tool"): + agent_module.normalize_disabled_tools(["not_a_real_tool"]) + + with pytest.raises(ValueError, match="Cannot disable required tool"): + agent_module.normalize_disabled_tools(["task_complete"]) diff --git a/tests/test_cli.py b/tests/test_cli.py index 042a420..f159c34 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,6 +20,7 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path port=8787, runs_dir=tmp_path / "runs", db_path=tmp_path / "screenjob.db", + prohibited_key_combos=("ctrl+shift+s",), ) config.runs_dir.mkdir(parents=True, exist_ok=True) @@ -71,3 +72,11 @@ def test_cli_emits_structured_return_and_data(monkeypatch: Any, capsys, tmp_path assert payload["data"] == "file1.txt\nfile2.txt" assert captured_kwargs["options"].reasoning_effort == "medium" assert captured_kwargs["options"].screen_context_decay_steps == 4 + assert captured_kwargs["options"].max_visual_context_images == 3 + assert captured_kwargs["options"].native_automation_mode == "prefer" + assert captured_kwargs["options"].dialog_timeout_seconds == 12.0 + assert captured_kwargs["options"].focus_timeout_seconds == 8.0 + assert captured_kwargs["options"].ui_element_timeout_seconds == 8.0 + assert captured_kwargs["options"].max_retries_per_surface == 3 + assert captured_kwargs["options"].pretty_logs is False + assert captured_kwargs["options"].prohibited_key_combos == {"ctrl+shift+s"} diff 
--git a/tests/test_server_api.py b/tests/test_server_api.py index 3fe8e45..a0ca2ad 100644 --- a/tests/test_server_api.py +++ b/tests/test_server_api.py @@ -46,6 +46,13 @@ class FakeJobManager: click_pause: float = 0.10, reasoning_effort: str = "medium", screen_context_decay_steps: int = 4, + max_visual_context_images: int = 3, + native_automation_mode: str = "prefer", + dialog_timeout_seconds: float = 12.0, + focus_timeout_seconds: float = 8.0, + ui_element_timeout_seconds: float = 8.0, + max_retries_per_surface: int = 3, + pretty_logs: bool = False, disabled_tools: list[str] | None = None, safety_override: bool = False, no_failsafe: bool = False, @@ -69,6 +76,13 @@ class FakeJobManager: "click_pause": click_pause, "reasoning_effort": reasoning_effort, "screen_context_decay_steps": screen_context_decay_steps, + "max_visual_context_images": max_visual_context_images, + "native_automation_mode": native_automation_mode, + "dialog_timeout_seconds": dialog_timeout_seconds, + "focus_timeout_seconds": focus_timeout_seconds, + "ui_element_timeout_seconds": ui_element_timeout_seconds, + "max_retries_per_surface": max_retries_per_surface, + "pretty_logs": pretty_logs, "no_failsafe": no_failsafe, } self._jobs[job_id] = { @@ -293,6 +307,7 @@ def _build_app(tmp_path: Path, monkeypatch: Any, disable_ui: bool = False): port=8787, runs_dir=tmp_path / "runs", db_path=tmp_path / "screenjob_test.db", + prohibited_key_combos=("ctrl+shift+s",), ) config.runs_dir.mkdir(parents=True, exist_ok=True) app = server_module.create_app(config) @@ -326,6 +341,13 @@ def test_create_job_returns_only_job_id_and_defaults_model(tmp_path: Path, monke assert manager.last_submit_payload["disabled_tools"] == ["click"] assert manager.last_submit_payload["reasoning_effort"] == "medium" assert manager.last_submit_payload["screen_context_decay_steps"] == 4 + assert manager.last_submit_payload["max_visual_context_images"] == 3 + assert manager.last_submit_payload["native_automation_mode"] == "prefer" + assert 
manager.last_submit_payload["dialog_timeout_seconds"] == 12.0 + assert manager.last_submit_payload["focus_timeout_seconds"] == 8.0 + assert manager.last_submit_payload["ui_element_timeout_seconds"] == 8.0 + assert manager.last_submit_payload["max_retries_per_surface"] == 3 + assert manager.last_submit_payload["pretty_logs"] is False status_res = client.get(f"/api/jobs/{job_id}/status", headers=headers) assert status_res.status_code == 200 @@ -334,6 +356,36 @@ def test_create_job_returns_only_job_id_and_defaults_model(tmp_path: Path, monke assert "data" in status_res.json()["response"] +def test_create_job_rejects_invalid_disabled_tool_names(tmp_path: Path, monkeypatch: Any) -> None: + app, _ = _build_app(tmp_path, monkeypatch, disable_ui=False) + client = TestClient(app) + headers = {"Authorization": "Bearer test_token"} + + response = client.post( + "/api/jobs", + headers=headers, + json={"job": "Open amazon.de", "disabled_tools": ["not_a_real_tool"], "safety_override": True}, + ) + + assert response.status_code == 400 + assert "Unknown disabled tool" in response.json()["detail"] + + +def test_create_job_rejects_disabling_task_complete(tmp_path: Path, monkeypatch: Any) -> None: + app, _ = _build_app(tmp_path, monkeypatch, disable_ui=False) + client = TestClient(app) + headers = {"Authorization": "Bearer test_token"} + + response = client.post( + "/api/jobs", + headers=headers, + json={"job": "Open amazon.de", "disabled_tools": ["task_complete"], "safety_override": True}, + ) + + assert response.status_code == 400 + assert "Cannot disable required tool" in response.json()["detail"] + + def test_cancel_endpoint_and_events(tmp_path: Path, monkeypatch: Any) -> None: app, _ = _build_app(tmp_path, monkeypatch, disable_ui=False) client = TestClient(app)