from __future__ import annotations import json import logging import subprocess import threading import time import traceback from typing import Any, Callable from openai import OpenAI from PIL import Image, ImageEnhance, ImageFilter, ImageOps from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary from .pricing import estimate_cost_usd from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso try: import pyautogui except Exception as import_exc: pyautogui = None # type: ignore[assignment] _PYAUTOGUI_IMPORT_ERROR = import_exc else: _PYAUTOGUI_IMPORT_ERROR = None SYSTEM_PROMPT = """ You are ScreenJob, an autonomous desktop-and-terminal task executor. Rules: 1) Use tools to act. Do not claim actions without tool calls. 2) Prefer execute_command for deterministic actions: - opening URLs/websites (Windows: start https://amazon.de) - launching apps or running terminal checks 3) For UI tasks, inspect with see_screen before clicking/typing. 4) Coordinates are absolute screen pixels (x, y) from top-left. 5) Use enhance(coordinate) when text/UI is unclear. 6) For keyboard-heavy interactions, prefer press_key for special keys. 7) You may call multiple tools in one step. If needed, do click then sleep. 8) Never spam repeated clicks on the same coordinate; switch strategy. 9) Keep tool arguments valid JSON and concise. 10) When objective is fully complete, call task_complete(result="..."). """ class ScreenJobAgent: def __init__( self, client: OpenAI, logger: logging.Logger, artifacts: RunArtifacts, options: RuntimeOptions, cancel_event: threading.Event | None = None, event_callback: Callable[[dict[str, Any]], None] | None = None, ) -> None: if pyautogui is None: raise RuntimeError( "pyautogui is required for agent execution. " "Install dependencies and ensure GUI access. " f"Import error: {_PYAUTOGUI_IMPORT_ERROR}" ) self.client = client self.logger = logger self.artifacts = artifacts self.options = options self.cancel_event = cancel_event or threading.Event() self.event_callback = event_callback self.step = 0 self.completed = False self.final_result = "" self.previous_response_id: str | None = None self.usage = UsageSummary() self.last_screen_data_url: str | None = None self.last_screen_meta: dict[str, Any] | None = None self.click_history: list[tuple[int, int, float]] = [] self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()} def _emit(self, event_type: str, payload: dict[str, Any]) -> None: if self.event_callback is None: return event = { "ts": utc_now_iso(), "job_run_id": self.artifacts.run_id, "step": self.step, "event_type": event_type, "payload": payload, } try: self.event_callback(event) except Exception: # noqa: BLE001 self.logger.debug("Event callback failed.", exc_info=True) def _is_cancelled(self) -> bool: return bool(self.cancel_event.is_set()) def _register_usage(self, response: Any) -> None: usage_obj = getattr(response, "usage", None) if usage_obj is None: return input_tokens = int(getattr(usage_obj, "input_tokens", 0) or 0) output_tokens = int(getattr(usage_obj, "output_tokens", 0) or 0) total_tokens = int(getattr(usage_obj, "total_tokens", input_tokens + output_tokens) or 0) input_details = getattr(usage_obj, "input_tokens_details", None) cached_tokens = int(getattr(input_details, "cached_tokens", 0) or 0) if input_details else 0 output_details = getattr(usage_obj, "output_tokens_details", None) reasoning_tokens = int(getattr(output_details, "reasoning_tokens", 0) or 0) if output_details else 0 self.usage.input_tokens += input_tokens self.usage.cached_input_tokens += cached_tokens self.usage.output_tokens += output_tokens self.usage.reasoning_tokens += reasoning_tokens self.usage.total_tokens += total_tokens estimated_cost, model_for_pricing = estimate_cost_usd(self.options.model, self.usage) self.usage.estimated_cost_usd = estimated_cost self.usage.model_for_pricing = model_for_pricing self._emit( "usage_update", { "usage": self.usage.to_dict(), "response_id": getattr(response, "id", None), }, ) def _tool_schemas(self) -> list[dict[str, Any]]: all_tools: list[dict[str, Any]] = [ { "type": "function", "name": "task_complete", "description": "Call this when the job objective is fully done.", "parameters": { "type": "object", "properties": { "result": {"type": "string"}, }, "required": ["result"], "additionalProperties": False, }, }, { "type": "function", "name": "execute_command", "description": ( "Run a shell command and return stdout/stderr/exit code. " "Prefer this for deterministic operations like opening URLs." ), "parameters": { "type": "object", "properties": { "command": {"type": "string"}, }, "required": ["command"], "additionalProperties": False, }, }, { "type": "function", "name": "sleep", "description": ( "Pause execution for a short time. " "Use this instead of shell sleep commands." ), "parameters": { "type": "object", "properties": { "seconds": {"type": ["number", "string"]}, }, "required": ["seconds"], "additionalProperties": False, }, }, { "type": "function", "name": "see_screen", "description": "Capture full screen with coordinate grid overlay.", "parameters": { "type": "object", "properties": {}, "additionalProperties": False, }, }, { "type": "function", "name": "enhance", "description": "Create enhanced zoom around a coordinate for readability.", "parameters": { "type": "object", "properties": { "coordinate": { "type": "object", "properties": { "x": {"type": "integer"}, "y": {"type": "integer"}, }, "required": ["x", "y"], "additionalProperties": False, } }, "required": ["coordinate"], "additionalProperties": False, }, }, { "type": "function", "name": "type", "description": "Type literal text into the active focused element.", "parameters": { "type": "object", "properties": {"text": {"type": "string"}}, "required": ["text"], "additionalProperties": False, }, }, { "type": "function", "name": "press_key", "description": "Press a specific key (enter, tab, esc, arrows, etc).", "parameters": { "type": "object", "properties": { "key": {"type": "string"}, "repeats": {"type": "integer", "minimum": 1}, }, "required": ["key"], "additionalProperties": False, }, }, { "type": "function", "name": "click", "description": ( "Click absolute screen coordinate with simple directional offsets. " "Use offset_up/down/left/right values like 2 or '2px'. " "Optional sleep_after_seconds performs a pause immediately after click." ), "parameters": { "type": "object", "properties": { "coordinate": { "type": "object", "properties": { "x": {"type": "integer"}, "y": {"type": "integer"}, }, "required": ["x", "y"], "additionalProperties": False, }, "offset": { "type": "object", "properties": { "x": {"type": "integer"}, "y": {"type": "integer"}, }, "required": [], "additionalProperties": False, }, "offset_up": {"type": ["integer", "string"]}, "offset_down": {"type": ["integer", "string"]}, "offset_left": {"type": ["integer", "string"]}, "offset_right": {"type": ["integer", "string"]}, "sleep_after_seconds": {"type": ["number", "string"]}, }, "required": ["coordinate"], "additionalProperties": False, }, }, ] return [tool for tool in all_tools if tool["name"] not in self.disabled_tools] def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]: screenshot = pyautogui.screenshot().convert("RGB") width, height = screenshot.size image = draw_global_grid(screenshot) if with_grid else screenshot meta = { "width": width, "height": height, "captured_at": utc_now_iso(), "grid": bool(with_grid), } return image, meta def _save_image(self, image: Image.Image, path) -> None: path.parent.mkdir(parents=True, exist_ok=True) image.save(path, format="PNG") def _build_visual_message(self, title: str, data_url: str, meta: dict[str, Any]) -> dict[str, Any]: text = ( f"{title}\n" f"Metadata: {json.dumps(meta, ensure_ascii=False)}\n" "Use coordinates from this image for click/enhance actions." ) return { "role": "user", "content": [ {"type": "input_text", "text": text}, {"type": "input_image", "image_url": data_url, "detail": "high"}, ], } def _parse_px(self, value: Any) -> int: if value is None: return 0 if isinstance(value, bool): return int(value) if isinstance(value, int): return value if isinstance(value, float): return int(round(value)) text = str(value).strip().lower() if text.endswith("px"): text = text[:-2].strip() try: return int(float(text)) except Exception: # noqa: BLE001 return 0 def _parse_seconds(self, value: Any, default: float = 0.0, max_seconds: float = 60.0) -> float: if value is None: return default if isinstance(value, (int, float)): sec = float(value) else: text = str(value).strip().lower() if text.endswith("ms"): try: sec = float(text[:-2].strip()) / 1000.0 except Exception: # noqa: BLE001 sec = default else: if text.endswith("s"): text = text[:-1].strip() try: sec = float(text) except Exception: # noqa: BLE001 sec = default if sec < 0: sec = 0.0 if sec > max_seconds: sec = max_seconds return sec def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]: image, meta = self._capture_screen(with_grid=True) out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png" self._save_image(image, out_path) data_url = image_to_data_url(image, "PNG") self.last_screen_data_url = data_url self.last_screen_meta = meta | {"path": str(out_path.resolve())} return { "ok": True, "path": str(out_path.resolve()), "meta": self.last_screen_meta, "message": "Screen captured with coordinate grid.", } def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]: coord = args.get("coordinate") or {} x = int(coord.get("x", 0)) y = int(coord.get("y", 0)) base, base_meta = self._capture_screen(with_grid=False) width, height = base.size region_half = 180 left = clamp(x - region_half, 0, width - 1) top = clamp(y - region_half, 0, height - 1) right = clamp(x + region_half, left + 1, width) bottom = clamp(y + region_half, top + 1, height) crop = base.crop((left, top, right, bottom)) upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC) enhanced = ImageOps.autocontrast(upscaled) enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0) enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25) enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2)) out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png" self._save_image(enhanced, out_path) data_url = image_to_data_url(enhanced, "PNG") meta = { "captured_at": utc_now_iso(), "source_coord": {"x": x, "y": y}, "source_box": {"left": left, "top": top, "right": right, "bottom": bottom}, "scale": 2, "path": str(out_path.resolve()), "screen_size": {"width": width, "height": height}, "base_capture_meta": base_meta, } self.last_screen_data_url = data_url self.last_screen_meta = meta return {"ok": True, "meta": meta, "message": "Enhanced view generated."} def _tool_click(self, args: dict[str, Any]) -> dict[str, Any]: coord = args.get("coordinate") or {} offset = args.get("offset") or {} base_x = int(coord.get("x", 0)) base_y = int(coord.get("y", 0)) legacy_dx = self._parse_px(offset.get("x", 0)) legacy_dy = self._parse_px(offset.get("y", 0)) up = self._parse_px(args.get("offset_up", 0)) down = self._parse_px(args.get("offset_down", 0)) left = self._parse_px(args.get("offset_left", 0)) right = self._parse_px(args.get("offset_right", 0)) x = base_x + legacy_dx + right - left y = base_y + legacy_dy + down - up width, height = pyautogui.size() x = clamp(x, 0, max(0, width - 1)) y = clamp(y, 0, max(0, height - 1)) now = time.time() self.click_history.append((x, y, now)) self.click_history = self.click_history[-20:] very_recent = [(cx, cy, ts) for (cx, cy, ts) in self.click_history if now - ts <= 25] near_same = [ (cx, cy, ts) for (cx, cy, ts) in very_recent if abs(cx - x) <= 6 and abs(cy - y) <= 6 ] if len(near_same) >= 4: return { "ok": False, "blocked": True, "error": ( "Repeated click loop detected at nearly same coordinate. " "Switch strategy: call see_screen/enhance and use execute_command." ), "clicked": {"x": x, "y": y}, "recent_similar_clicks": len(near_same), } pyautogui.moveTo(x, y, duration=self.options.click_pause) pyautogui.click(x=x, y=y) sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0) wait_remaining = sleep_after if sleep_after > 0 else 0.15 while wait_remaining > 0: if self._is_cancelled(): break interval = min(0.05, wait_remaining) time.sleep(interval) wait_remaining -= interval return { "ok": True, "clicked": {"x": x, "y": y}, "base_coordinate": {"x": base_x, "y": base_y}, "applied_offset": { "legacy": {"x": legacy_dx, "y": legacy_dy}, "directional": {"up": up, "down": down, "left": left, "right": right}, }, "sleep_after_seconds": sleep_after, "screen_size": {"width": width, "height": height}, "message": "Click executed.", } def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]: text = str(args.get("text", "")) for char in text: if self._is_cancelled(): return {"ok": False, "cancelled": True, "typed_length": 0} pyautogui.write(char, interval=0) time.sleep(self.options.type_interval) return {"ok": True, "typed_length": len(text), "message": "Text typed."} def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]: key = str(args.get("key", "")).strip().lower() repeats = max(1, int(args.get("repeats", 1))) if not key: return {"ok": False, "error": "Missing key."} repeats = min(repeats, 50) pressed = 0 for _ in range(repeats): if self._is_cancelled(): break pyautogui.press(key) pressed += 1 time.sleep(0.03) return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."} def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]: seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0) elapsed = 0.0 while elapsed < seconds: if self._is_cancelled(): return {"ok": False, "cancelled": True, "slept_seconds": round(elapsed, 3)} interval = min(0.1, seconds - elapsed) time.sleep(interval) elapsed += interval return {"ok": True, "slept_seconds": round(seconds, 3), "message": "Sleep completed."} def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]: command = str(args.get("command", "")).strip() if not command: return {"ok": False, "error": "Empty command."} started = time.time() process: subprocess.Popen[str] | None = None try: process = subprocess.Popen( command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) while True: if self._is_cancelled(): process.terminate() return { "ok": False, "cancelled": True, "command": command, "elapsed_ms": int((time.time() - started) * 1000), } if process.poll() is not None: break if (time.time() - started) > self.options.command_timeout: process.kill() stdout, stderr = process.communicate(timeout=2) return { "ok": False, "command": command, "error": "Command timed out.", "elapsed_ms": int((time.time() - started) * 1000), "timeout_seconds": self.options.command_timeout, "stdout": (stdout or "")[-12000:], "stderr": (stderr or "")[-12000:], } time.sleep(0.05) stdout, stderr = process.communicate(timeout=2) return { "ok": True, "command": command, "exit_code": process.returncode, "stdout": (stdout or "")[-12000:], "stderr": (stderr or "")[-12000:], "elapsed_ms": int((time.time() - started) * 1000), } except Exception as exc: # noqa: BLE001 if process is not None and process.poll() is None: try: process.kill() except Exception: # noqa: BLE001 pass return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"} def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]: result = str(args.get("result", "")).strip() or "Task completed." self.completed = True self.final_result = result return {"ok": True, "result": result} def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]: if name in self.disabled_tools: return {"ok": False, "error": f"Tool '{name}' is disabled for this job."} handlers = { "see_screen": self._tool_see_screen, "enhance": self._tool_enhance, "click": self._tool_click, "type": self._tool_type, "press_key": self._tool_press_key, "sleep": self._tool_sleep, "execute_command": self._tool_execute_command, "task_complete": self._tool_task_complete, } handler = handlers.get(name) if handler is None: return {"ok": False, "error": f"Unknown tool: {name}"} return handler(args) def _safe_parse_args(self, raw: str | None) -> dict[str, Any]: if not raw: return {} try: parsed = json.loads(raw) return parsed if isinstance(parsed, dict) else {"value": parsed} except Exception: # noqa: BLE001 return {"_raw": raw} def _call_model(self, input_items: list[dict[str, Any]]) -> Any: return self.client.responses.create( model=self.options.model, instructions=SYSTEM_PROMPT, tools=self._tool_schemas(), input=input_items, previous_response_id=self.previous_response_id, parallel_tool_calls=True, max_tool_calls=8, ) def run(self, job: str) -> AgentResult: started_at = time.time() self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model) self.logger.info("Job: %s", job) self.logger.info("Disabled tools: %s", sorted(self.disabled_tools)) self._emit( "job_started", { "run_id": self.artifacts.run_id, "model": self.options.model, "objective": job, "disabled_tools": sorted(self.disabled_tools), }, ) self._tool_see_screen({}) init_input: list[dict[str, Any]] = [ { "role": "user", "content": [ { "type": "input_text", "text": ( f"JOB: {job}\n" "You are in an action loop. Prefer execute_command for deterministic actions. " "You can return multiple tool calls in one step (example: click then sleep). " "Call task_complete(result=...) only when truly done." ), } ], } ] if self.last_screen_data_url and self.last_screen_meta: init_input.append( self._build_visual_message("Initial screen capture", self.last_screen_data_url, self.last_screen_meta) ) pending_input = init_input error_text: str | None = None cancelled = False while self.step < self.options.max_steps and not self.completed: if self._is_cancelled(): cancelled = True error_text = "Cancelled by user request." break self.step += 1 self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps) self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps}) try: response = self._call_model(pending_input) self._register_usage(response) except Exception as exc: # noqa: BLE001 self.logger.exception("OpenAI API call failed on step %d", self.step) error_text = f"OpenAI API call failed: {type(exc).__name__}: {exc}" break self.previous_response_id = response.id output_items = list(response.output or []) text_preview = getattr(response, "output_text", "") or "" if text_preview.strip(): self.logger.info("Model text: %s", text_preview.strip()[:500]) self._emit("model_text", {"step": self.step, "text": text_preview.strip()[:2000]}) tool_calls = [item for item in output_items if getattr(item, "type", None) == "function_call"] if not tool_calls: self.logger.warning("No tool calls returned; nudging model to continue with tools.") self._emit("step_warning", {"step": self.step, "message": "No tool calls; nudged model."}) pending_input = [ { "role": "user", "content": [ { "type": "input_text", "text": ( "No function call was returned. Continue by using tools. " "You may call multiple tools in one step. " "When complete, call task_complete(result=...)." ), } ], } ] continue next_input: list[dict[str, Any]] = [] for tool_call in tool_calls: if self._is_cancelled(): cancelled = True error_text = "Cancelled by user request." break name = str(getattr(tool_call, "name", "")) call_id = str(getattr(tool_call, "call_id", "")) args_raw = getattr(tool_call, "arguments", "{}") args = self._safe_parse_args(args_raw) self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False)) self._emit( "tool_called", {"step": self.step, "tool": name, "args": args}, ) try: result = self._dispatch_tool(name, args) except Exception as exc: # noqa: BLE001 self.logger.exception("Tool execution failed: %s", name) result = { "ok": False, "error": f"{type(exc).__name__}: {exc}", "traceback": traceback.format_exc()[-8000:], } self.logger.debug( "Tool result for %s: %s", name, json.dumps(result, ensure_ascii=False)[:2500], ) self._emit("tool_result", {"step": self.step, "tool": name, "result": result}) next_input.append( { "type": "function_call_output", "call_id": call_id, "output": json.dumps(result, ensure_ascii=False), } ) if name in ("see_screen", "enhance") and self.last_screen_data_url and self.last_screen_meta: title = "Updated screen capture" if name == "see_screen" else "Enhanced screen region" next_input.append( self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta) ) self._emit( "visual_update", { "step": self.step, "kind": name, "image_meta": self.last_screen_meta, }, ) if cancelled: break pending_input = next_input ended_at = time.time() if self.completed: self.logger.info("Task completed in %d step(s).", self.step) self._emit("job_completed", {"result": self.final_result, "steps": self.step, "usage": self.usage.to_dict()}) return AgentResult( completed=True, result=self.final_result, steps=self.step, started_at=started_at, ended_at=ended_at, usage=self.usage, ) if cancelled: self.logger.warning("Run cancelled by user after %d step(s).", self.step) self._emit("job_cancelled", {"steps": self.step, "usage": self.usage.to_dict()}) return AgentResult( completed=False, result="Cancelled by user request.", steps=self.step, started_at=started_at, ended_at=ended_at, usage=self.usage, error=error_text, cancelled=True, ) if error_text: self.logger.error("Run failed: %s", error_text) self._emit("job_failed", {"steps": self.step, "error": error_text, "usage": self.usage.to_dict()}) return AgentResult( completed=False, result=error_text, steps=self.step, started_at=started_at, ended_at=ended_at, usage=self.usage, error=error_text, ) self.logger.warning("Stopped due to step limit (%d).", self.options.max_steps) result_text = f"Stopped after max steps ({self.options.max_steps}) without task_complete." self._emit("job_stopped", {"steps": self.step, "reason": "max_steps", "usage": self.usage.to_dict()}) return AgentResult( completed=False, result=result_text, steps=self.step, started_at=started_at, ended_at=ended_at, usage=self.usage, error=result_text, )