Files
screenjob/src/agent.py
Space-Banane a19b285232
All checks were successful
CI / test (push) Successful in 48s
test: add pytest verification suite and gitea ci workflow
2026-05-27 17:55:34 +02:00

794 lines
31 KiB
Python

from __future__ import annotations

import json
import logging
import math
import subprocess
import threading
import time
import traceback
from typing import Any, Callable

from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter, ImageOps

from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
from .pricing import estimate_cost_usd
from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso

try:
    import pyautogui
except Exception as import_exc:
    pyautogui = None  # type: ignore[assignment]
    _PYAUTOGUI_IMPORT_ERROR = import_exc
else:
    _PYAUTOGUI_IMPORT_ERROR = None
# Instruction text sent with every model request: ScreenJobAgent._call_model
# passes it as the `instructions` argument. The string is runtime data, and its
# numbered rules refer to the tool names declared in
# ScreenJobAgent._tool_schemas, so keep the two in sync when editing either.
SYSTEM_PROMPT = """
You are ScreenJob, an autonomous desktop-and-terminal task executor.
Rules:
1) Use tools to act. Do not claim actions without tool calls.
2) Prefer execute_command for deterministic actions:
- opening URLs/websites (Windows: start https://amazon.de)
- launching apps or running terminal checks
3) For UI tasks, inspect with see_screen before clicking/typing.
4) Coordinates are absolute screen pixels (x, y) from top-left.
5) Use enhance(coordinate) when text/UI is unclear.
6) For keyboard-heavy interactions, prefer press_key for special keys.
7) You may call multiple tools in one step. If needed, do click then sleep.
8) Never spam repeated clicks on the same coordinate; switch strategy.
9) Keep tool arguments valid JSON and concise.
10) When objective is fully complete, call task_complete(result="...").
"""
class ScreenJobAgent:
    """Run a job by looping between the model and local desktop/shell tools.

    Each step sends the pending input to the Responses API, executes every
    function call the model returned (screen capture, zoom, click, typing, key
    presses, sleeping, shell commands) and feeds the JSON results back. The
    loop ends when ``task_complete`` is called, the step limit is reached, an
    API call fails, or the cancel event is set.
    """

    def __init__(
        self,
        client: OpenAI,
        logger: logging.Logger,
        artifacts: RunArtifacts,
        options: RuntimeOptions,
        cancel_event: threading.Event | None = None,
        event_callback: Callable[[dict[str, Any]], None] | None = None,
    ) -> None:
        """Store collaborators and reset per-run state.

        Args:
            client: OpenAI client used for ``responses.create`` calls.
            logger: Logger receiving progress and error messages.
            artifacts: Run artifacts; ``run_id``, ``shots_dir`` and
                ``enhance_dir`` are read by this class.
            options: Runtime options; ``model``, ``max_steps``,
                ``click_pause``, ``type_interval``, ``command_timeout`` and
                ``disable_tools`` are read by this class.
            cancel_event: Event that aborts the run when set. A private event
                is created when omitted.
            event_callback: Optional sink for structured progress events.

        Raises:
            RuntimeError: If ``pyautogui`` could not be imported.
        """
        if pyautogui is None:
            raise RuntimeError(
                "pyautogui is required for agent execution. "
                "Install dependencies and ensure GUI access. "
                f"Import error: {_PYAUTOGUI_IMPORT_ERROR}"
            )
        self.client = client
        self.logger = logger
        self.artifacts = artifacts
        self.options = options
        self.cancel_event = cancel_event or threading.Event()
        self.event_callback = event_callback
        self.step = 0
        self.completed = False
        self.final_result = ""
        self.previous_response_id: str | None = None
        self.usage = UsageSummary()
        # Most recent image (full screen or enhanced crop) and its metadata;
        # run() attaches them to the next model input.
        self.last_screen_data_url: str | None = None
        self.last_screen_meta: dict[str, Any] | None = None
        # (x, y, timestamp) of recent click attempts, used for loop detection.
        self.click_history: list[tuple[int, int, float]] = []
        self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}

    def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
        """Send a structured event to ``event_callback``, if one is set.

        Callback failures are logged at debug level and never propagate, so a
        broken listener cannot abort the run.
        """
        if self.event_callback is None:
            return
        event = {
            "ts": utc_now_iso(),
            "job_run_id": self.artifacts.run_id,
            "step": self.step,
            "event_type": event_type,
            "payload": payload,
        }
        try:
            self.event_callback(event)
        except Exception:  # noqa: BLE001 - deliberate best-effort delivery
            self.logger.debug("Event callback failed.", exc_info=True)

    def _is_cancelled(self) -> bool:
        """Return True once the cancel event has been set."""
        return bool(self.cancel_event.is_set())

    def _register_usage(self, response: Any) -> None:
        """Add the token usage of ``response`` to the running totals.

        Does nothing when the response carries no ``usage`` object. Otherwise
        it updates ``self.usage``, recomputes the cost estimate and emits a
        ``usage_update`` event.
        """
        usage_obj = getattr(response, "usage", None)
        if usage_obj is None:
            return
        input_tokens = int(getattr(usage_obj, "input_tokens", 0) or 0)
        output_tokens = int(getattr(usage_obj, "output_tokens", 0) or 0)
        # Fall back to the sum whenever the total is absent *or* empty; the
        # attribute may exist with a None/0 value.
        reported_total = getattr(usage_obj, "total_tokens", None)
        total_tokens = int(reported_total) if reported_total else input_tokens + output_tokens
        input_details = getattr(usage_obj, "input_tokens_details", None)
        cached_tokens = int(getattr(input_details, "cached_tokens", 0) or 0) if input_details else 0
        output_details = getattr(usage_obj, "output_tokens_details", None)
        reasoning_tokens = int(getattr(output_details, "reasoning_tokens", 0) or 0) if output_details else 0
        self.usage.input_tokens += input_tokens
        self.usage.cached_input_tokens += cached_tokens
        self.usage.output_tokens += output_tokens
        self.usage.reasoning_tokens += reasoning_tokens
        self.usage.total_tokens += total_tokens
        estimated_cost, model_for_pricing = estimate_cost_usd(self.options.model, self.usage)
        self.usage.estimated_cost_usd = estimated_cost
        self.usage.model_for_pricing = model_for_pricing
        self._emit(
            "usage_update",
            {
                "usage": self.usage.to_dict(),
                "response_id": getattr(response, "id", None),
            },
        )

    def _tool_schemas(self) -> list[dict[str, Any]]:
        """Return the function-tool schemas, minus any disabled tools."""
        all_tools: list[dict[str, Any]] = [
            {
                "type": "function",
                "name": "task_complete",
                "description": "Call this when the job objective is fully done.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "result": {"type": "string"},
                    },
                    "required": ["result"],
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "execute_command",
                "description": (
                    "Run a shell command and return stdout/stderr/exit code. "
                    "Prefer this for deterministic operations like opening URLs."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {"type": "string"},
                    },
                    "required": ["command"],
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "sleep",
                "description": (
                    "Pause execution for a short time. "
                    "Use this instead of shell sleep commands."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "seconds": {"type": ["number", "string"]},
                    },
                    "required": ["seconds"],
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "see_screen",
                "description": "Capture full screen with coordinate grid overlay.",
                "parameters": {
                    "type": "object",
                    "properties": {},
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "enhance",
                "description": "Create enhanced zoom around a coordinate for readability.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "coordinate": {
                            "type": "object",
                            "properties": {
                                "x": {"type": "integer"},
                                "y": {"type": "integer"},
                            },
                            "required": ["x", "y"],
                            "additionalProperties": False,
                        }
                    },
                    "required": ["coordinate"],
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "type",
                "description": "Type literal text into the active focused element.",
                "parameters": {
                    "type": "object",
                    "properties": {"text": {"type": "string"}},
                    "required": ["text"],
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "press_key",
                "description": "Press a specific key (enter, tab, esc, arrows, etc).",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "key": {"type": "string"},
                        "repeats": {"type": "integer", "minimum": 1},
                    },
                    "required": ["key"],
                    "additionalProperties": False,
                },
            },
            {
                "type": "function",
                "name": "click",
                "description": (
                    "Click absolute screen coordinate with simple directional offsets. "
                    "Use offset_up/down/left/right values like 2 or '2px'. "
                    "Optional sleep_after_seconds performs a pause immediately after click."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "coordinate": {
                            "type": "object",
                            "properties": {
                                "x": {"type": "integer"},
                                "y": {"type": "integer"},
                            },
                            "required": ["x", "y"],
                            "additionalProperties": False,
                        },
                        "offset": {
                            "type": "object",
                            "properties": {
                                "x": {"type": "integer"},
                                "y": {"type": "integer"},
                            },
                            "required": [],
                            "additionalProperties": False,
                        },
                        "offset_up": {"type": ["integer", "string"]},
                        "offset_down": {"type": ["integer", "string"]},
                        "offset_left": {"type": ["integer", "string"]},
                        "offset_right": {"type": ["integer", "string"]},
                        "sleep_after_seconds": {"type": ["number", "string"]},
                    },
                    "required": ["coordinate"],
                    "additionalProperties": False,
                },
            },
        ]
        return [tool for tool in all_tools if tool["name"] not in self.disabled_tools]

    def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]:
        """Take a screenshot and return ``(image, meta)``.

        Args:
            with_grid: When true the image is passed through
                ``draw_global_grid``; otherwise the raw RGB screenshot is
                returned.
        """
        screenshot = pyautogui.screenshot().convert("RGB")
        width, height = screenshot.size
        image = draw_global_grid(screenshot) if with_grid else screenshot
        meta = {
            "width": width,
            "height": height,
            "captured_at": utc_now_iso(),
            "grid": bool(with_grid),
        }
        return image, meta

    def _save_image(self, image: Image.Image, path) -> None:
        """Write ``image`` as PNG to ``path``, creating parent directories.

        ``path`` must expose ``.parent.mkdir`` (e.g. a ``pathlib.Path``).
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        image.save(path, format="PNG")

    def _build_visual_message(self, title: str, data_url: str, meta: dict[str, Any]) -> dict[str, Any]:
        """Build a user message pairing an image with its title and metadata."""
        text = (
            f"{title}\n"
            f"Metadata: {json.dumps(meta, ensure_ascii=False)}\n"
            "Use coordinates from this image for click/enhance actions."
        )
        return {
            "role": "user",
            "content": [
                {"type": "input_text", "text": text},
                {"type": "input_image", "image_url": data_url, "detail": "high"},
            ],
        }

    def _parse_px(self, value: Any) -> int:
        """Convert a pixel amount such as ``2``, ``2.4`` or ``"2px"`` to int.

        Floats are rounded, numeric strings are truncated after an optional
        ``px`` suffix is removed. ``None``, unparseable text and non-finite
        numbers all yield 0.
        """
        if value is None:
            return 0
        if isinstance(value, bool):
            return int(value)
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            # int(round(inf/nan)) raises; treat those like any other junk.
            return int(round(value)) if math.isfinite(value) else 0
        text = str(value).strip().lower()
        if text.endswith("px"):
            text = text[:-2].strip()
        try:
            return int(float(text))
        except (ValueError, OverflowError):
            return 0

    def _parse_seconds(self, value: Any, default: float = 0.0, max_seconds: float = 60.0) -> float:
        """Convert a duration to seconds, clamped to ``[0, max_seconds]``.

        Accepts numbers and strings with an optional ``s`` or ``ms`` suffix.
        ``None`` returns ``default`` unclamped; unparseable text and NaN fall
        back to ``default`` before clamping.
        """
        if value is None:
            return default
        if isinstance(value, (int, float)):
            sec = float(value)
        else:
            text = str(value).strip().lower()
            if text.endswith("ms"):
                number, divisor = text[:-2], 1000.0
            elif text.endswith("s"):
                number, divisor = text[:-1], 1.0
            else:
                number, divisor = text, 1.0
            try:
                sec = float(number.strip()) / divisor
            except ValueError:
                sec = default
        if math.isnan(sec):
            # NaN compares false against both bounds and would escape the
            # clamps below, ending up in sleep loops and JSON tool results.
            sec = default
        if sec < 0:
            sec = 0.0
        if sec > max_seconds:
            sec = max_seconds
        return sec

    def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
        """Capture the gridded screen, save it and remember it as latest view."""
        image, meta = self._capture_screen(with_grid=True)
        out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
        self._save_image(image, out_path)
        data_url = image_to_data_url(image, "PNG")
        self.last_screen_data_url = data_url
        self.last_screen_meta = meta | {"path": str(out_path.resolve())}
        return {
            "ok": True,
            "path": str(out_path.resolve()),
            "meta": self.last_screen_meta,
            "message": "Screen captured with coordinate grid.",
        }

    def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
        """Save a 2x upscaled, sharpened crop around ``args["coordinate"]``.

        The crop spans up to 180 px on each side of the point, clipped to the
        screen. The result replaces the stored "latest view" image/metadata.
        """
        coord = args.get("coordinate") or {}
        x = int(coord.get("x", 0))
        y = int(coord.get("y", 0))
        base, base_meta = self._capture_screen(with_grid=False)
        width, height = base.size
        region_half = 180
        left = clamp(x - region_half, 0, width - 1)
        top = clamp(y - region_half, 0, height - 1)
        # Lower bounds of left + 1 / top + 1 keep the crop at least 1 px wide.
        right = clamp(x + region_half, left + 1, width)
        bottom = clamp(y + region_half, top + 1, height)
        crop = base.crop((left, top, right, bottom))
        upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC)
        enhanced = ImageOps.autocontrast(upscaled)
        enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0)
        enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
        enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))
        out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png"
        self._save_image(enhanced, out_path)
        data_url = image_to_data_url(enhanced, "PNG")
        meta = {
            "captured_at": utc_now_iso(),
            "source_coord": {"x": x, "y": y},
            "source_box": {"left": left, "top": top, "right": right, "bottom": bottom},
            "scale": 2,
            "path": str(out_path.resolve()),
            "screen_size": {"width": width, "height": height},
            "base_capture_meta": base_meta,
        }
        self.last_screen_data_url = data_url
        self.last_screen_meta = meta
        return {"ok": True, "meta": meta, "message": "Enhanced view generated."}

    def _tool_click(self, args: dict[str, Any]) -> dict[str, Any]:
        """Click at ``coordinate`` plus optional offsets, then pause briefly.

        The target is clamped to the screen. A click is refused when four or
        more attempts (including this one) landed within 6 px of each other in
        the last 25 seconds. After clicking, the method waits
        ``sleep_after_seconds`` (max 30 s) or 0.15 s, stopping early on cancel.

        Returns:
            A result dict; ``ok`` is False when the coordinate is missing or
            the click was blocked as a repeat loop.
        """
        coord = args.get("coordinate")
        if not isinstance(coord, dict) or "x" not in coord or "y" not in coord:
            # Without this guard, unparseable arguments would fall back to
            # (0, 0) and perform a real click in the screen corner.
            return {
                "ok": False,
                "error": "Missing coordinate: expected an object with integer 'x' and 'y'.",
            }
        offset = args.get("offset") or {}
        base_x = int(coord["x"])
        base_y = int(coord["y"])
        legacy_dx = self._parse_px(offset.get("x", 0))
        legacy_dy = self._parse_px(offset.get("y", 0))
        up = self._parse_px(args.get("offset_up", 0))
        down = self._parse_px(args.get("offset_down", 0))
        left = self._parse_px(args.get("offset_left", 0))
        right = self._parse_px(args.get("offset_right", 0))
        x = base_x + legacy_dx + right - left
        y = base_y + legacy_dy + down - up
        width, height = pyautogui.size()
        x = clamp(x, 0, max(0, width - 1))
        y = clamp(y, 0, max(0, height - 1))
        now = time.time()
        # The attempt is recorded before the loop check, so blocked attempts
        # also count and keep the block active while the model keeps retrying.
        self.click_history.append((x, y, now))
        self.click_history = self.click_history[-20:]
        very_recent = [(cx, cy, ts) for (cx, cy, ts) in self.click_history if now - ts <= 25]
        near_same = [
            (cx, cy, ts)
            for (cx, cy, ts) in very_recent
            if abs(cx - x) <= 6 and abs(cy - y) <= 6
        ]
        if len(near_same) >= 4:
            return {
                "ok": False,
                "blocked": True,
                "error": (
                    "Repeated click loop detected at nearly same coordinate. "
                    "Switch strategy: call see_screen/enhance and use execute_command."
                ),
                "clicked": {"x": x, "y": y},
                "recent_similar_clicks": len(near_same),
            }
        pyautogui.moveTo(x, y, duration=self.options.click_pause)
        pyautogui.click(x=x, y=y)
        sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0)
        wait_remaining = sleep_after if sleep_after > 0 else 0.15
        # Sleep in small slices so a cancel request is noticed promptly.
        while wait_remaining > 0:
            if self._is_cancelled():
                break
            interval = min(0.05, wait_remaining)
            time.sleep(interval)
            wait_remaining -= interval
        return {
            "ok": True,
            "clicked": {"x": x, "y": y},
            "base_coordinate": {"x": base_x, "y": base_y},
            "applied_offset": {
                "legacy": {"x": legacy_dx, "y": legacy_dy},
                "directional": {"up": up, "down": down, "left": left, "right": right},
            },
            "sleep_after_seconds": sleep_after,
            "screen_size": {"width": width, "height": height},
            "message": "Click executed.",
        }

    def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]:
        """Type ``args["text"]`` one character at a time.

        Cancellation is checked before each character; in that case the result
        reports how many characters had already been sent.
        """
        text = str(args.get("text", ""))
        for typed, char in enumerate(text):
            if self._is_cancelled():
                return {"ok": False, "cancelled": True, "typed_length": typed}
            pyautogui.write(char, interval=0)
            time.sleep(self.options.type_interval)
        return {"ok": True, "typed_length": len(text), "message": "Text typed."}

    def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
        """Press ``args["key"]`` between 1 and 50 times.

        The returned ``repeats`` is the number of presses actually performed,
        which is lower than requested when the run is cancelled midway.
        """
        key = str(args.get("key", "")).strip().lower()
        if not key:
            return {"ok": False, "error": "Missing key."}
        # `or 1` also covers an explicit null, which int() would reject.
        repeats = min(max(1, int(args.get("repeats") or 1)), 50)
        pressed = 0
        for _ in range(repeats):
            if self._is_cancelled():
                break
            pyautogui.press(key)
            pressed += 1
            time.sleep(0.03)
        return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}

    def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
        """Sleep for ``args["seconds"]`` (max 60 s) in cancellable slices."""
        seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
        elapsed = 0.0
        while elapsed < seconds:
            if self._is_cancelled():
                return {"ok": False, "cancelled": True, "slept_seconds": round(elapsed, 3)}
            interval = min(0.1, seconds - elapsed)
            time.sleep(interval)
            elapsed += interval
        return {"ok": True, "slept_seconds": round(seconds, 3), "message": "Sleep completed."}

    @staticmethod
    def _stop_and_drain(process: subprocess.Popen[str], *, graceful: bool = False) -> tuple[str, str]:
        """Stop ``process``, reap it and return the output that was collected.

        Returns empty strings when the pipes cannot be drained within two
        seconds (for example because a grandchild still holds them open).
        """
        if graceful:
            process.terminate()
        else:
            process.kill()
        try:
            stdout, stderr = process.communicate(timeout=2)
        except subprocess.TimeoutExpired:
            process.kill()
            try:
                process.wait(timeout=1)
            except subprocess.TimeoutExpired:
                pass
            return "", ""
        return stdout or "", stderr or ""

    def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]:
        """Run ``args["command"]`` through the shell and capture its output.

        The child is polled roughly every 50 ms so cancellation and
        ``options.command_timeout`` are honoured. stdout and stderr are each
        truncated to their last 12000 characters.

        NOTE(security): the command string comes from the model and is run
        with ``shell=True`` by design; restrict this tool via
        ``disable_tools`` when that is not acceptable.
        """
        command = str(args.get("command", "")).strip()
        if not command:
            return {"ok": False, "error": "Empty command."}
        started = time.time()
        process: subprocess.Popen[str] | None = None
        try:
            process = subprocess.Popen(
                command,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            while True:
                if self._is_cancelled():
                    # Reap the child so no zombie or open pipe is left behind.
                    self._stop_and_drain(process, graceful=True)
                    return {
                        "ok": False,
                        "cancelled": True,
                        "command": command,
                        "elapsed_ms": int((time.time() - started) * 1000),
                    }
                try:
                    # communicate() drains both pipes while waiting; plain
                    # poll() would let a chatty child block on a full pipe.
                    # Output gathered before a timeout is kept for the retry.
                    stdout, stderr = process.communicate(timeout=0.05)
                    break
                except subprocess.TimeoutExpired:
                    pass
                if (time.time() - started) > self.options.command_timeout:
                    stdout, stderr = self._stop_and_drain(process)
                    return {
                        "ok": False,
                        "command": command,
                        "error": "Command timed out.",
                        "elapsed_ms": int((time.time() - started) * 1000),
                        "timeout_seconds": self.options.command_timeout,
                        "stdout": (stdout or "")[-12000:],
                        "stderr": (stderr or "")[-12000:],
                    }
            return {
                "ok": True,
                "command": command,
                "exit_code": process.returncode,
                "stdout": (stdout or "")[-12000:],
                "stderr": (stderr or "")[-12000:],
                "elapsed_ms": int((time.time() - started) * 1000),
            }
        except Exception as exc:  # noqa: BLE001 - reported back to the model
            if process is not None and process.poll() is None:
                try:
                    process.kill()
                except Exception:  # noqa: BLE001
                    pass
            return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"}

    def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]:
        """Mark the run as completed and store the final result text."""
        result = str(args.get("result", "")).strip() or "Task completed."
        self.completed = True
        self.final_result = result
        return {"ok": True, "result": result}

    def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
        """Route a tool call to its handler.

        Disabled and unknown tool names produce an ``ok: False`` result
        instead of raising.
        """
        if name in self.disabled_tools:
            return {"ok": False, "error": f"Tool '{name}' is disabled for this job."}
        handlers = {
            "see_screen": self._tool_see_screen,
            "enhance": self._tool_enhance,
            "click": self._tool_click,
            "type": self._tool_type,
            "press_key": self._tool_press_key,
            "sleep": self._tool_sleep,
            "execute_command": self._tool_execute_command,
            "task_complete": self._tool_task_complete,
        }
        handler = handlers.get(name)
        if handler is None:
            return {"ok": False, "error": f"Unknown tool: {name}"}
        return handler(args)

    def _safe_parse_args(self, raw: str | None) -> dict[str, Any]:
        """Decode tool-call arguments without ever raising.

        Returns ``{}`` for empty input, the decoded object when it is a dict,
        ``{"value": ...}`` for any other JSON value and ``{"_raw": raw}`` when
        decoding fails.
        """
        if not raw:
            return {}
        try:
            parsed = json.loads(raw)
            return parsed if isinstance(parsed, dict) else {"value": parsed}
        except Exception:  # noqa: BLE001
            return {"_raw": raw}

    def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
        """Send ``input_items`` to the Responses API, chained to the last response."""
        return self.client.responses.create(
            model=self.options.model,
            instructions=SYSTEM_PROMPT,
            tools=self._tool_schemas(),
            input=input_items,
            previous_response_id=self.previous_response_id,
            parallel_tool_calls=True,
            max_tool_calls=8,
        )

    def run(self, job: str) -> AgentResult:
        """Execute ``job`` and return the outcome.

        Takes an initial screenshot, then repeats model call -> tool execution
        until completion, cancellation, an API failure or
        ``options.max_steps``. Exceptions raised by tools are caught and
        returned to the model as failed tool results.

        Returns:
            An ``AgentResult`` describing completion, cancellation, failure or
            the step-limit stop, including accumulated usage.
        """
        started_at = time.time()
        self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
        self.logger.info("Job: %s", job)
        self.logger.info("Disabled tools: %s", sorted(self.disabled_tools))
        self._emit(
            "job_started",
            {
                "run_id": self.artifacts.run_id,
                "model": self.options.model,
                "objective": job,
                "disabled_tools": sorted(self.disabled_tools),
            },
        )
        # The first capture is taken directly, regardless of disabled tools.
        self._tool_see_screen({})
        init_input: list[dict[str, Any]] = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_text",
                        "text": (
                            f"JOB: {job}\n"
                            "You are in an action loop. Prefer execute_command for deterministic actions. "
                            "You can return multiple tool calls in one step (example: click then sleep). "
                            "Call task_complete(result=...) only when truly done."
                        ),
                    }
                ],
            }
        ]
        if self.last_screen_data_url and self.last_screen_meta:
            init_input.append(
                self._build_visual_message("Initial screen capture", self.last_screen_data_url, self.last_screen_meta)
            )
        pending_input = init_input
        error_text: str | None = None
        cancelled = False
        while self.step < self.options.max_steps and not self.completed:
            if self._is_cancelled():
                cancelled = True
                error_text = "Cancelled by user request."
                break
            self.step += 1
            self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
            self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
            try:
                response = self._call_model(pending_input)
                self._register_usage(response)
            except Exception as exc:  # noqa: BLE001
                self.logger.exception("OpenAI API call failed on step %d", self.step)
                error_text = f"OpenAI API call failed: {type(exc).__name__}: {exc}"
                break
            self.previous_response_id = response.id
            output_items = list(response.output or [])
            text_preview = getattr(response, "output_text", "") or ""
            if text_preview.strip():
                self.logger.info("Model text: %s", text_preview.strip()[:500])
                self._emit("model_text", {"step": self.step, "text": text_preview.strip()[:2000]})
            tool_calls = [item for item in output_items if getattr(item, "type", None) == "function_call"]
            if not tool_calls:
                # A text-only reply still consumes a step; remind the model
                # that progress must go through tools.
                self.logger.warning("No tool calls returned; nudging model to continue with tools.")
                self._emit("step_warning", {"step": self.step, "message": "No tool calls; nudged model."})
                pending_input = [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_text",
                                "text": (
                                    "No function call was returned. Continue by using tools. "
                                    "You may call multiple tools in one step. "
                                    "When complete, call task_complete(result=...)."
                                ),
                            }
                        ],
                    }
                ]
                continue
            next_input: list[dict[str, Any]] = []
            for tool_call in tool_calls:
                if self._is_cancelled():
                    cancelled = True
                    error_text = "Cancelled by user request."
                    break
                name = str(getattr(tool_call, "name", ""))
                call_id = str(getattr(tool_call, "call_id", ""))
                args_raw = getattr(tool_call, "arguments", "{}")
                args = self._safe_parse_args(args_raw)
                self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False))
                self._emit(
                    "tool_called",
                    {"step": self.step, "tool": name, "args": args},
                )
                try:
                    result = self._dispatch_tool(name, args)
                except Exception as exc:  # noqa: BLE001
                    # Tool failures go back to the model instead of ending the run.
                    self.logger.exception("Tool execution failed: %s", name)
                    result = {
                        "ok": False,
                        "error": f"{type(exc).__name__}: {exc}",
                        "traceback": traceback.format_exc()[-8000:],
                    }
                self.logger.debug(
                    "Tool result for %s: %s",
                    name,
                    json.dumps(result, ensure_ascii=False)[:2500],
                )
                self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
                next_input.append(
                    {
                        "type": "function_call_output",
                        "call_id": call_id,
                        "output": json.dumps(result, ensure_ascii=False),
                    }
                )
                if name in ("see_screen", "enhance") and self.last_screen_data_url and self.last_screen_meta:
                    # Attach the image itself; the tool output only has metadata.
                    title = "Updated screen capture" if name == "see_screen" else "Enhanced screen region"
                    next_input.append(
                        self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta)
                    )
                    self._emit(
                        "visual_update",
                        {
                            "step": self.step,
                            "kind": name,
                            "image_meta": self.last_screen_meta,
                        },
                    )
            if cancelled:
                break
            pending_input = next_input
        ended_at = time.time()
        # Outcome precedence: completed > cancelled > failed > step limit.
        if self.completed:
            self.logger.info("Task completed in %d step(s).", self.step)
            self._emit("job_completed", {"result": self.final_result, "steps": self.step, "usage": self.usage.to_dict()})
            return AgentResult(
                completed=True,
                result=self.final_result,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
                usage=self.usage,
            )
        if cancelled:
            self.logger.warning("Run cancelled by user after %d step(s).", self.step)
            self._emit("job_cancelled", {"steps": self.step, "usage": self.usage.to_dict()})
            return AgentResult(
                completed=False,
                result="Cancelled by user request.",
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
                usage=self.usage,
                error=error_text,
                cancelled=True,
            )
        if error_text:
            self.logger.error("Run failed: %s", error_text)
            self._emit("job_failed", {"steps": self.step, "error": error_text, "usage": self.usage.to_dict()})
            return AgentResult(
                completed=False,
                result=error_text,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
                usage=self.usage,
                error=error_text,
            )
        self.logger.warning("Stopped due to step limit (%d).", self.options.max_steps)
        result_text = f"Stopped after max steps ({self.options.max_steps}) without task_complete."
        self._emit("job_stopped", {"steps": self.step, "reason": "max_steps", "usage": self.usage.to_dict()})
        return AgentResult(
            completed=False,
            result=result_text,
            steps=self.step,
            started_at=started_at,
            ended_at=ended_at,
            usage=self.usage,
            error=result_text,
        )