# Listing metadata (from the source viewer): 794 lines, 31 KiB, Python.
from __future__ import annotations

import json
import logging
import math
import subprocess
import threading
import time
import traceback
from typing import Any, Callable

from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter, ImageOps

from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
from .pricing import estimate_cost_usd
from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso

# pyautogui is imported lazily-tolerantly: the module must stay importable on
# machines without the package or without GUI access. The import can fail with
# more than ImportError, hence the deliberately broad catch. The captured
# exception is reported later by ScreenJobAgent.__init__.
try:
    import pyautogui
except Exception as import_exc:  # noqa: BLE001
    pyautogui = None  # type: ignore[assignment]
    _PYAUTOGUI_IMPORT_ERROR = import_exc
else:
    _PYAUTOGUI_IMPORT_ERROR = None


# Instructions sent with every model call (see ScreenJobAgent._call_model).
SYSTEM_PROMPT = """
You are ScreenJob, an autonomous desktop-and-terminal task executor.

Rules:
1) Use tools to act. Do not claim actions without tool calls.
2) Prefer execute_command for deterministic actions:
   - opening URLs/websites (Windows: start https://amazon.de)
   - launching apps or running terminal checks
3) For UI tasks, inspect with see_screen before clicking/typing.
4) Coordinates are absolute screen pixels (x, y) from top-left.
5) Use enhance(coordinate) when text/UI is unclear.
6) For keyboard-heavy interactions, prefer press_key for special keys.
7) You may call multiple tools in one step. If needed, do click then sleep.
8) Never spam repeated clicks on the same coordinate; switch strategy.
9) Keep tool arguments valid JSON and concise.
10) When objective is fully complete, call task_complete(result="...").
"""


class ScreenJobAgent:
    """Drive a desktop session through an OpenAI tool-calling loop.

    Each step sends the pending input to the Responses API, executes the
    function calls the model returns (screen capture, zoom, click, typing,
    key presses, sleeping, shell commands) and feeds the JSON results back.
    The loop ends when the model calls ``task_complete``, the step budget
    (``options.max_steps``) is exhausted, an API call fails, or the cancel
    event is set.
    """

    # Tools whose successful result is followed by an image message.
    _VISUAL_TOOLS = ("see_screen", "enhance")
    _CANCEL_MESSAGE = "Cancelled by user request."
    # Only this many trailing characters of command output are returned.
    _OUTPUT_TAIL_CHARS = 12000

    def __init__(
        self,
        client: OpenAI,
        logger: logging.Logger,
        artifacts: RunArtifacts,
        options: RuntimeOptions,
        cancel_event: threading.Event | None = None,
        event_callback: Callable[[dict[str, Any]], None] | None = None,
    ) -> None:
        """Store collaborators and reset per-run state.

        Args:
            client: OpenAI client used for ``responses.create`` calls.
            logger: Destination for run logging.
            artifacts: Provides ``run_id`` and the screenshot output folders.
            options: Runtime settings (model, step limit, timeouts, ...).
            cancel_event: Set by the caller to request cancellation; a private
                event is created when omitted.
            event_callback: Optional sink for progress events (see ``_emit``).

        Raises:
            RuntimeError: If pyautogui could not be imported.
        """
        if pyautogui is None:
            raise RuntimeError(
                "pyautogui is required for agent execution. "
                "Install dependencies and ensure GUI access. "
                f"Import error: {_PYAUTOGUI_IMPORT_ERROR}"
            )
        self.client = client
        self.logger = logger
        self.artifacts = artifacts
        self.options = options
        self.cancel_event = cancel_event or threading.Event()
        self.event_callback = event_callback

        self.step = 0
        self.completed = False
        self.final_result = ""
        self.previous_response_id: str | None = None
        self.usage = UsageSummary()

        # Most recent image produced by see_screen/enhance and its metadata.
        self.last_screen_data_url: str | None = None
        self.last_screen_meta: dict[str, Any] | None = None
        # (x, y, unix_time) of recent click attempts, for loop detection.
        self.click_history: list[tuple[int, int, float]] = []
        self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}

    # ------------------------------------------------------------------
    # Events, cancellation, usage accounting
    # ------------------------------------------------------------------

    def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
        """Send a progress event to ``event_callback``; never raises."""
        if self.event_callback is None:
            return
        event = {
            "ts": utc_now_iso(),
            "job_run_id": self.artifacts.run_id,
            "step": self.step,
            "event_type": event_type,
            "payload": payload,
        }
        try:
            self.event_callback(event)
        except Exception:  # noqa: BLE001
            # A broken observer must not abort the run.
            self.logger.debug("Event callback failed.", exc_info=True)

    def _is_cancelled(self) -> bool:
        """Return True once the cancel event has been set."""
        return bool(self.cancel_event.is_set())

    def _register_usage(self, response: Any) -> None:
        """Add the token usage of ``response`` to the running totals.

        Missing or ``None`` counters count as zero. When the response carries
        no usable ``total_tokens``, input + output is used instead so the
        total never lags behind its parts.
        """
        usage_obj = getattr(response, "usage", None)
        if usage_obj is None:
            return

        def count(source: Any, field: str) -> int:
            if source is None:
                return 0
            return int(getattr(source, field, 0) or 0)

        input_tokens = count(usage_obj, "input_tokens")
        output_tokens = count(usage_obj, "output_tokens")
        total_tokens = count(usage_obj, "total_tokens") or (input_tokens + output_tokens)
        cached_tokens = count(getattr(usage_obj, "input_tokens_details", None), "cached_tokens")
        reasoning_tokens = count(getattr(usage_obj, "output_tokens_details", None), "reasoning_tokens")

        self.usage.input_tokens += input_tokens
        self.usage.cached_input_tokens += cached_tokens
        self.usage.output_tokens += output_tokens
        self.usage.reasoning_tokens += reasoning_tokens
        self.usage.total_tokens += total_tokens
        estimated_cost, model_for_pricing = estimate_cost_usd(self.options.model, self.usage)
        self.usage.estimated_cost_usd = estimated_cost
        self.usage.model_for_pricing = model_for_pricing

        self._emit(
            "usage_update",
            {
                "usage": self.usage.to_dict(),
                "response_id": getattr(response, "id", None),
            },
        )

    # ------------------------------------------------------------------
    # Tool schemas
    # ------------------------------------------------------------------

    @staticmethod
    def _function_tool(
        name: str,
        description: str,
        properties: dict[str, Any],
        required: list[str] | None = None,
    ) -> dict[str, Any]:
        """Build one function-tool schema; ``required`` is omitted when None."""
        parameters: dict[str, Any] = {"type": "object", "properties": properties}
        if required is not None:
            parameters["required"] = required
        parameters["additionalProperties"] = False
        return {
            "type": "function",
            "name": name,
            "description": description,
            "parameters": parameters,
        }

    @staticmethod
    def _xy_schema(required: list[str]) -> dict[str, Any]:
        """Return a fresh JSON schema for an ``{x, y}`` integer object."""
        return {
            "type": "object",
            "properties": {
                "x": {"type": "integer"},
                "y": {"type": "integer"},
            },
            "required": required,
            "additionalProperties": False,
        }

    def _tool_schemas(self) -> list[dict[str, Any]]:
        """Return the schemas of all tools that are not disabled for this job."""
        tool = self._function_tool
        px = {"type": ["integer", "string"]}
        seconds = {"type": ["number", "string"]}
        all_tools: list[dict[str, Any]] = [
            tool(
                "task_complete",
                "Call this when the job objective is fully done.",
                {"result": {"type": "string"}},
                ["result"],
            ),
            tool(
                "execute_command",
                "Run a shell command and return stdout/stderr/exit code. "
                "Prefer this for deterministic operations like opening URLs.",
                {"command": {"type": "string"}},
                ["command"],
            ),
            tool(
                "sleep",
                "Pause execution for a short time. "
                "Use this instead of shell sleep commands.",
                {"seconds": dict(seconds)},
                ["seconds"],
            ),
            tool("see_screen", "Capture full screen with coordinate grid overlay.", {}),
            tool(
                "enhance",
                "Create enhanced zoom around a coordinate for readability.",
                {"coordinate": self._xy_schema(["x", "y"])},
                ["coordinate"],
            ),
            tool(
                "type",
                "Type literal text into the active focused element.",
                {"text": {"type": "string"}},
                ["text"],
            ),
            tool(
                "press_key",
                "Press a specific key (enter, tab, esc, arrows, etc).",
                {
                    "key": {"type": "string"},
                    "repeats": {"type": "integer", "minimum": 1},
                },
                ["key"],
            ),
            tool(
                "click",
                "Click absolute screen coordinate with simple directional offsets. "
                "Use offset_up/down/left/right values like 2 or '2px'. "
                "Optional sleep_after_seconds performs a pause immediately after click.",
                {
                    "coordinate": self._xy_schema(["x", "y"]),
                    "offset": self._xy_schema([]),
                    "offset_up": dict(px),
                    "offset_down": dict(px),
                    "offset_left": dict(px),
                    "offset_right": dict(px),
                    "sleep_after_seconds": dict(seconds),
                },
                ["coordinate"],
            ),
        ]
        return [schema for schema in all_tools if schema["name"] not in self.disabled_tools]

    # ------------------------------------------------------------------
    # Screen capture helpers
    # ------------------------------------------------------------------

    def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]:
        """Take an RGB screenshot, optionally with the coordinate grid drawn.

        Returns:
            The image and a metadata dict (size, capture time, grid flag).
        """
        screenshot = pyautogui.screenshot().convert("RGB")
        width, height = screenshot.size
        image = draw_global_grid(screenshot) if with_grid else screenshot
        meta = {
            "width": width,
            "height": height,
            "captured_at": utc_now_iso(),
            "grid": bool(with_grid),
        }
        return image, meta

    def _save_image(self, image: Image.Image, path) -> None:
        """Write ``image`` as PNG to ``path``, creating parent folders."""
        path.parent.mkdir(parents=True, exist_ok=True)
        image.save(path, format="PNG")

    def _build_visual_message(self, title: str, data_url: str, meta: dict[str, Any]) -> dict[str, Any]:
        """Wrap an image data URL and its metadata in a user message."""
        text = (
            f"{title}\n"
            f"Metadata: {json.dumps(meta, ensure_ascii=False)}\n"
            "Use coordinates from this image for click/enhance actions."
        )
        return {
            "role": "user",
            "content": [
                {"type": "input_text", "text": text},
                {"type": "input_image", "image_url": data_url, "detail": "high"},
            ],
        }

    # ------------------------------------------------------------------
    # Argument parsing
    # ------------------------------------------------------------------

    def _parse_px(self, value: Any) -> int:
        """Coerce a pixel offset such as ``2``, ``2.4`` or ``"2px"`` to int.

        ``None``, unparseable text and non-finite numbers all yield 0.
        """
        if value is None:
            return 0
        if isinstance(value, int):  # also covers bool
            return int(value)
        if isinstance(value, float):
            # round() raises on NaN/inf; treat those as "no offset".
            return int(round(value)) if math.isfinite(value) else 0
        text = str(value).strip().lower().removesuffix("px").strip()
        try:
            return int(float(text))
        except (ValueError, OverflowError):
            return 0

    def _parse_seconds(self, value: Any, default: float = 0.0, max_seconds: float = 60.0) -> float:
        """Coerce a duration to seconds, clamped to ``[0, max_seconds]``.

        Accepts numbers and strings with an optional ``s`` or ``ms`` suffix.
        ``None`` returns ``default`` unclamped; unparseable or NaN input falls
        back to ``default`` and is then clamped.
        """
        if value is None:
            return default
        if isinstance(value, (int, float)):
            sec = float(value)
        else:
            text = str(value).strip().lower()
            divisor = 1.0
            if text.endswith("ms"):
                text, divisor = text[:-2].strip(), 1000.0
            elif text.endswith("s"):
                text = text[:-1].strip()
            try:
                sec = float(text) / divisor
            except ValueError:
                sec = default
        if math.isnan(sec):
            # NaN compares false against both bounds and would escape the clamp.
            sec = default
        return min(max(sec, 0.0), max_seconds)

    @staticmethod
    def _read_coordinate(args: dict[str, Any]) -> tuple[int, int] | None:
        """Return ``(x, y)`` from ``args["coordinate"]`` or None if unusable."""
        coord = args.get("coordinate")
        if not isinstance(coord, dict):
            return None
        try:
            return int(coord["x"]), int(coord["y"])
        except (KeyError, TypeError, ValueError, OverflowError):
            return None

    # ------------------------------------------------------------------
    # Tool implementations
    # ------------------------------------------------------------------

    def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
        """Capture the screen with grid, save it, and remember it as latest."""
        image, meta = self._capture_screen(with_grid=True)
        out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
        self._save_image(image, out_path)
        saved_path = str(out_path.resolve())

        self.last_screen_data_url = image_to_data_url(image, "PNG")
        self.last_screen_meta = meta | {"path": saved_path}
        return {
            "ok": True,
            "path": saved_path,
            "meta": self.last_screen_meta,
            "message": "Screen captured with coordinate grid.",
        }

    def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
        """Save a 2x upscaled, sharpened crop around the given coordinate."""
        point = self._read_coordinate(args)
        if point is None:
            return {"ok": False, "error": "Missing or invalid coordinate; expected integer x and y."}
        x, y = point
        base, base_meta = self._capture_screen(with_grid=False)
        width, height = base.size

        # Crop up to 180 px in every direction, kept inside the screen and
        # at least one pixel wide/tall.
        region_half = 180
        left = clamp(x - region_half, 0, width - 1)
        top = clamp(y - region_half, 0, height - 1)
        right = clamp(x + region_half, left + 1, width)
        bottom = clamp(y + region_half, top + 1, height)

        crop = base.crop((left, top, right, bottom))
        upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC)
        enhanced = ImageOps.autocontrast(upscaled)
        enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0)
        enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
        enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))

        out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png"
        self._save_image(enhanced, out_path)

        meta = {
            "captured_at": utc_now_iso(),
            "source_coord": {"x": x, "y": y},
            "source_box": {"left": left, "top": top, "right": right, "bottom": bottom},
            "scale": 2,
            "path": str(out_path.resolve()),
            "screen_size": {"width": width, "height": height},
            "base_capture_meta": base_meta,
        }
        self.last_screen_data_url = image_to_data_url(enhanced, "PNG")
        self.last_screen_meta = meta
        return {"ok": True, "meta": meta, "message": "Enhanced view generated."}

    def _tool_click(self, args: dict[str, Any]) -> dict[str, Any]:
        """Click at ``coordinate`` plus optional offsets, then pause briefly.

        Calls without a usable coordinate are rejected instead of defaulting
        to (0, 0). A fourth attempt within 6 px of the same spot inside 25
        seconds is refused so the model has to change strategy.
        """
        point = self._read_coordinate(args)
        if point is None:
            return {"ok": False, "error": "Missing or invalid coordinate; expected integer x and y."}
        base_x, base_y = point
        offset = args.get("offset") or {}
        if not isinstance(offset, dict):
            offset = {}

        legacy_dx = self._parse_px(offset.get("x", 0))
        legacy_dy = self._parse_px(offset.get("y", 0))
        up = self._parse_px(args.get("offset_up", 0))
        down = self._parse_px(args.get("offset_down", 0))
        left = self._parse_px(args.get("offset_left", 0))
        right = self._parse_px(args.get("offset_right", 0))

        width, height = pyautogui.size()
        x = clamp(base_x + legacy_dx + right - left, 0, max(0, width - 1))
        y = clamp(base_y + legacy_dy + down - up, 0, max(0, height - 1))

        # The attempt is recorded before the check, so refused attempts keep
        # counting towards the loop detector.
        now = time.time()
        self.click_history.append((x, y, now))
        self.click_history = self.click_history[-20:]
        near_same = [
            (cx, cy, ts)
            for (cx, cy, ts) in self.click_history
            if now - ts <= 25 and abs(cx - x) <= 6 and abs(cy - y) <= 6
        ]
        if len(near_same) >= 4:
            return {
                "ok": False,
                "blocked": True,
                "error": (
                    "Repeated click loop detected at nearly same coordinate. "
                    "Switch strategy: call see_screen/enhance and use execute_command."
                ),
                "clicked": {"x": x, "y": y},
                "recent_similar_clicks": len(near_same),
            }

        pyautogui.moveTo(x, y, duration=self.options.click_pause)
        pyautogui.click(x=x, y=y)
        sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0)
        # Always give the UI a moment to react; wait in small slices so a
        # cancel request is noticed quickly.
        wait_remaining = sleep_after if sleep_after > 0 else 0.15
        while wait_remaining > 0 and not self._is_cancelled():
            interval = min(0.05, wait_remaining)
            time.sleep(interval)
            wait_remaining -= interval

        return {
            "ok": True,
            "clicked": {"x": x, "y": y},
            "base_coordinate": {"x": base_x, "y": base_y},
            "applied_offset": {
                "legacy": {"x": legacy_dx, "y": legacy_dy},
                "directional": {"up": up, "down": down, "left": left, "right": right},
            },
            "sleep_after_seconds": sleep_after,
            "screen_size": {"width": width, "height": height},
            "message": "Click executed.",
        }

    def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]:
        """Type ``text`` one character at a time, honouring cancellation."""
        text = str(args.get("text", ""))
        for typed, char in enumerate(text):
            if self._is_cancelled():
                # Report how much was actually typed before the interruption.
                return {"ok": False, "cancelled": True, "typed_length": typed}
            pyautogui.write(char, interval=0)
            time.sleep(self.options.type_interval)
        return {"ok": True, "typed_length": len(text), "message": "Text typed."}

    def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
        """Press ``key`` between 1 and 50 times; invalid ``repeats`` means 1."""
        key = str(args.get("key", "")).strip().lower()
        if not key:
            return {"ok": False, "error": "Missing key."}
        try:
            requested = int(args.get("repeats", 1))
        except (TypeError, ValueError, OverflowError):
            requested = 1
        repeats = min(max(1, requested), 50)

        pressed = 0
        for _ in range(repeats):
            if self._is_cancelled():
                break
            pyautogui.press(key)
            pressed += 1
            time.sleep(0.03)
        if pressed < repeats:
            # Only a cancel request can end the loop early.
            return {"ok": False, "cancelled": True, "key": key, "repeats": pressed}
        return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}

    def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
        """Sleep up to 60 seconds in 0.1 s slices, stopping early on cancel."""
        seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
        elapsed = 0.0
        while elapsed < seconds:
            if self._is_cancelled():
                return {"ok": False, "cancelled": True, "slept_seconds": round(elapsed, 3)}
            interval = min(0.1, seconds - elapsed)
            time.sleep(interval)
            elapsed += interval
        return {"ok": True, "slept_seconds": round(seconds, 3), "message": "Sleep completed."}

    def _tail(self, text: str | None) -> str:
        """Return the trailing part of command output that is reported back."""
        return (text or "")[-self._OUTPUT_TAIL_CHARS:]

    @staticmethod
    def _reap_process(process: subprocess.Popen[str], grace_seconds: float) -> tuple[str, str]:
        """Collect output of a signalled process, killing it if it lingers.

        Returns empty strings when the pipes still cannot be drained, which
        can happen when grandchildren of the shell keep them open.
        """
        try:
            stdout, stderr = process.communicate(timeout=grace_seconds)
        except subprocess.TimeoutExpired:
            process.kill()
            try:
                stdout, stderr = process.communicate(timeout=grace_seconds)
            except subprocess.TimeoutExpired:
                stdout, stderr = "", ""
        return stdout or "", stderr or ""

    def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]:
        """Run a shell command and return its exit code and output tails.

        The command string is produced by the model and is executed through
        the shell by design; this tool must only be enabled for trusted jobs.
        Output is drained while waiting, so commands that print more than the
        OS pipe buffer cannot stall until the timeout. ``ok`` reports that the
        command ran to completion, not that its exit code was zero.
        """
        command = str(args.get("command", "")).strip()
        if not command:
            return {"ok": False, "error": "Empty command."}

        started = time.time()

        def elapsed_ms() -> int:
            return int((time.time() - started) * 1000)

        process: subprocess.Popen[str] | None = None
        try:
            process = subprocess.Popen(
                command,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            while True:
                if self._is_cancelled():
                    process.terminate()
                    # Reap the child so no zombie or open pipe is left behind.
                    self._reap_process(process, grace_seconds=1.0)
                    return {
                        "ok": False,
                        "cancelled": True,
                        "command": command,
                        "elapsed_ms": elapsed_ms(),
                    }
                try:
                    # Doubles as the poll interval; retrying after a timeout
                    # does not lose output already read.
                    stdout, stderr = process.communicate(timeout=0.05)
                    break
                except subprocess.TimeoutExpired:
                    pass
                if (time.time() - started) > self.options.command_timeout:
                    process.kill()
                    stdout, stderr = self._reap_process(process, grace_seconds=2.0)
                    return {
                        "ok": False,
                        "command": command,
                        "error": "Command timed out.",
                        "elapsed_ms": elapsed_ms(),
                        "timeout_seconds": self.options.command_timeout,
                        "stdout": self._tail(stdout),
                        "stderr": self._tail(stderr),
                    }

            return {
                "ok": True,
                "command": command,
                "exit_code": process.returncode,
                "stdout": self._tail(stdout),
                "stderr": self._tail(stderr),
                "elapsed_ms": elapsed_ms(),
            }
        except Exception as exc:  # noqa: BLE001
            if process is not None and process.poll() is None:
                try:
                    process.kill()
                except OSError:
                    # Best effort: the process may have exited in between.
                    self.logger.debug("Could not kill command process.", exc_info=True)
            return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"}

    def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]:
        """Mark the job as done and store the model's final result text."""
        result = str(args.get("result", "")).strip() or "Task completed."
        self.completed = True
        self.final_result = result
        return {"ok": True, "result": result}

    def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
        """Run the handler for ``name``; disabled/unknown tools yield errors."""
        if name in self.disabled_tools:
            return {"ok": False, "error": f"Tool '{name}' is disabled for this job."}
        handlers = {
            "see_screen": self._tool_see_screen,
            "enhance": self._tool_enhance,
            "click": self._tool_click,
            "type": self._tool_type,
            "press_key": self._tool_press_key,
            "sleep": self._tool_sleep,
            "execute_command": self._tool_execute_command,
            "task_complete": self._tool_task_complete,
        }
        handler = handlers.get(name)
        if handler is None:
            return {"ok": False, "error": f"Unknown tool: {name}"}
        return handler(args)

    def _safe_parse_args(self, raw: str | None) -> dict[str, Any]:
        """Decode tool-call arguments without raising.

        Non-object JSON is wrapped as ``{"value": ...}``; undecodable input is
        returned as ``{"_raw": raw}``.
        """
        if not raw:
            return {}
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError, RecursionError):
            return {"_raw": raw}
        return parsed if isinstance(parsed, dict) else {"value": parsed}

    # ------------------------------------------------------------------
    # Model loop
    # ------------------------------------------------------------------

    def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
        """Send ``input_items`` to the model, chained to the prior response."""
        return self.client.responses.create(
            model=self.options.model,
            instructions=SYSTEM_PROMPT,
            tools=self._tool_schemas(),
            input=input_items,
            previous_response_id=self.previous_response_id,
            parallel_tool_calls=True,
            max_tool_calls=8,
        )

    @staticmethod
    def _text_message(text: str) -> dict[str, Any]:
        """Wrap plain text in a user message."""
        return {"role": "user", "content": [{"type": "input_text", "text": text}]}

    def _initial_input(self, job: str) -> list[dict[str, Any]]:
        """Build the first model input: the job text plus the latest capture."""
        items = [
            self._text_message(
                f"JOB: {job}\n"
                "You are in an action loop. Prefer execute_command for deterministic actions. "
                "You can return multiple tool calls in one step (example: click then sleep). "
                "Call task_complete(result=...) only when truly done."
            )
        ]
        if self.last_screen_data_url and self.last_screen_meta:
            items.append(
                self._build_visual_message("Initial screen capture", self.last_screen_data_url, self.last_screen_meta)
            )
        return items

    def _run_tool_call(self, tool_call: Any) -> list[dict[str, Any]]:
        """Execute one function call and return the input items it produces.

        The first item is always the ``function_call_output``. A screenshot
        message follows only when a visual tool succeeded, so a failed or
        disabled capture never re-sends a stale image as if it were new.
        """
        name = str(getattr(tool_call, "name", ""))
        call_id = str(getattr(tool_call, "call_id", ""))
        args = self._safe_parse_args(getattr(tool_call, "arguments", "{}"))

        self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False))
        self._emit("tool_called", {"step": self.step, "tool": name, "args": args})
        try:
            result = self._dispatch_tool(name, args)
        except Exception as exc:  # noqa: BLE001
            # Tool failures are reported to the model instead of ending the run.
            self.logger.exception("Tool execution failed: %s", name)
            result = {
                "ok": False,
                "error": f"{type(exc).__name__}: {exc}",
                "traceback": traceback.format_exc()[-8000:],
            }

        serialized = json.dumps(result, ensure_ascii=False)
        self.logger.debug("Tool result for %s: %s", name, serialized[:2500])
        self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
        items: list[dict[str, Any]] = [
            {"type": "function_call_output", "call_id": call_id, "output": serialized}
        ]

        if (
            result.get("ok")
            and name in self._VISUAL_TOOLS
            and self.last_screen_data_url
            and self.last_screen_meta
        ):
            title = "Updated screen capture" if name == "see_screen" else "Enhanced screen region"
            items.append(self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta))
            self._emit(
                "visual_update",
                {
                    "step": self.step,
                    "kind": name,
                    "image_meta": self.last_screen_meta,
                },
            )
        return items

    def _build_result(self, started_at: float, cancelled: bool, error_text: str | None) -> AgentResult:
        """Log, emit and return the final outcome of a run.

        Precedence: completed, then cancelled, then failed, then step limit.
        """
        common = {
            "steps": self.step,
            "started_at": started_at,
            "ended_at": time.time(),
            "usage": self.usage,
        }

        if self.completed:
            self.logger.info("Task completed in %d step(s).", self.step)
            self._emit("job_completed", {"result": self.final_result, "steps": self.step, "usage": self.usage.to_dict()})
            return AgentResult(completed=True, result=self.final_result, **common)

        if cancelled:
            self.logger.warning("Run cancelled by user after %d step(s).", self.step)
            self._emit("job_cancelled", {"steps": self.step, "usage": self.usage.to_dict()})
            return AgentResult(
                completed=False,
                result="Cancelled by user request.",
                error=error_text,
                cancelled=True,
                **common,
            )

        if error_text:
            self.logger.error("Run failed: %s", error_text)
            self._emit("job_failed", {"steps": self.step, "error": error_text, "usage": self.usage.to_dict()})
            return AgentResult(completed=False, result=error_text, error=error_text, **common)

        self.logger.warning("Stopped due to step limit (%d).", self.options.max_steps)
        result_text = f"Stopped after max steps ({self.options.max_steps}) without task_complete."
        self._emit("job_stopped", {"steps": self.step, "reason": "max_steps", "usage": self.usage.to_dict()})
        return AgentResult(completed=False, result=result_text, error=result_text, **common)

    def run(self, job: str) -> AgentResult:
        """Execute ``job`` and return the outcome.

        Takes an initial screenshot, then alternates model calls and tool
        execution until completion, cancellation, an API failure, or the step
        limit. An exception from the initial screenshot propagates to the
        caller; failures inside the loop are turned into an ``AgentResult``.
        """
        started_at = time.time()
        self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
        self.logger.info("Job: %s", job)
        self.logger.info("Disabled tools: %s", sorted(self.disabled_tools))
        self._emit(
            "job_started",
            {
                "run_id": self.artifacts.run_id,
                "model": self.options.model,
                "objective": job,
                "disabled_tools": sorted(self.disabled_tools),
            },
        )

        # Called directly (not via _dispatch_tool), so the first capture is
        # taken even when see_screen is disabled for the model.
        self._tool_see_screen({})
        pending_input = self._initial_input(job)
        error_text: str | None = None
        cancelled = False

        while self.step < self.options.max_steps and not self.completed:
            if self._is_cancelled():
                cancelled = True
                error_text = self._CANCEL_MESSAGE
                break

            self.step += 1
            self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
            self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
            try:
                response = self._call_model(pending_input)
                self._register_usage(response)
            except Exception as exc:  # noqa: BLE001
                self.logger.exception("OpenAI API call failed on step %d", self.step)
                error_text = f"OpenAI API call failed: {type(exc).__name__}: {exc}"
                break

            self.previous_response_id = response.id
            text_preview = (getattr(response, "output_text", "") or "").strip()
            if text_preview:
                self.logger.info("Model text: %s", text_preview[:500])
                self._emit("model_text", {"step": self.step, "text": text_preview[:2000]})

            tool_calls = [
                item for item in (response.output or []) if getattr(item, "type", None) == "function_call"
            ]
            if not tool_calls:
                self.logger.warning("No tool calls returned; nudging model to continue with tools.")
                self._emit("step_warning", {"step": self.step, "message": "No tool calls; nudged model."})
                pending_input = [
                    self._text_message(
                        "No function call was returned. Continue by using tools. "
                        "You may call multiple tools in one step. "
                        "When complete, call task_complete(result=...)."
                    )
                ]
                continue

            next_input: list[dict[str, Any]] = []
            for tool_call in tool_calls:
                if self._is_cancelled():
                    cancelled = True
                    error_text = self._CANCEL_MESSAGE
                    break
                next_input.extend(self._run_tool_call(tool_call))

            if cancelled:
                break
            pending_input = next_input

        return self._build_result(started_at, cancelled, error_text)