chore: initialize screenjob project baseline

This commit is contained in:
Space-Banane
2026-05-27 17:31:49 +02:00
commit 84b0df520c
9 changed files with 1045 additions and 0 deletions

625
src/agent.py Normal file
View File

@@ -0,0 +1,625 @@
from __future__ import annotations
import json
import logging
import subprocess
import time
import traceback
from typing import Any
from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
from .models import AgentResult, RunArtifacts
from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso
try:
import pyautogui
except Exception as import_exc:
raise RuntimeError(
"pyautogui is required. Install dependencies with: pip install pyautogui pillow"
) from import_exc
# System instructions for the model; passed unchanged as `instructions` on
# every request made by ScreenJobAgent._call_model. The tool names mentioned
# here (execute_command, see_screen, enhance, press_key, task_complete) must
# stay in sync with the schemas returned by ScreenJobAgent._tool_schemas.
SYSTEM_PROMPT = """
You are ScreenJob, an autonomous desktop-and-terminal task executor.
Rules:
1) Use tools to act. Do not claim actions without tool calls.
2) Prefer execute_command for deterministic actions:
- opening URLs/websites (Windows: start https://amazon.de)
- launching apps or running terminal checks
3) For UI tasks, inspect with see_screen before clicking/typing.
4) Coordinates are absolute screen pixels (x, y) from top-left.
5) Use enhance(coordinate) when text/UI is unclear.
6) For keyboard-heavy interactions, prefer press_key for special keys.
7) You may call multiple tools in one step. If needed, do click then sleep.
8) Never spam repeated clicks on the same coordinate; switch strategy.
9) Keep tool arguments valid JSON and concise.
10) When objective is fully complete, call task_complete(result="...").
"""
class ScreenJobAgent:
    """Tool-calling agent that drives the desktop through a model loop.

    Each step sends the pending input to the model, executes every returned
    function call (screen capture, zoom, click, typing, key presses, sleep,
    shell commands) and feeds the JSON results back, until the model calls
    ``task_complete`` or ``max_steps`` is reached.
    """

    # Hint attached to full-screen captures, whose pixels are screen pixels.
    _DEFAULT_VISUAL_HINT = "Use coordinates from this image for click/enhance actions."
    # Hint attached to enhanced crops: they are cropped and upscaled, so their
    # pixel positions must be mapped back before being used as coordinates.
    _ENHANCE_VISUAL_HINT = (
        "This image is a zoomed crop of the screen region given by source_box in the metadata. "
        "Pixel positions in this image are NOT screen coordinates: convert with "
        "screen_x = source_box.left + image_x / scale and "
        "screen_y = source_box.top + image_y / scale before click/enhance actions."
    )
    _NUDGE_TEXT = (
        "No function call was returned. Continue by using tools. "
        "You may call multiple tools in one step. "
        "When complete, call task_complete(result=...)."
    )
    # Maximum number of trailing characters of stdout/stderr returned to the model.
    _OUTPUT_TAIL_CHARS = 12000

    def __init__(
        self,
        client: OpenAI,
        logger: logging.Logger,
        artifacts: RunArtifacts,
        model: str,
        max_steps: int,
        command_timeout: int,
        type_interval: float,
        click_pause: float,
    ) -> None:
        """Store collaborators and tuning knobs and reset the run state.

        Args:
            client: Client whose ``responses.create`` is used to call the model.
            logger: Logger receiving step, tool-call and error records.
            artifacts: Provides ``run_id``, ``shots_dir`` and ``enhance_dir``.
            model: Model identifier passed to the API.
            max_steps: Upper bound on model round-trips in ``run``.
            command_timeout: Timeout in seconds for ``execute_command``.
            type_interval: Per-character interval passed to ``pyautogui.write``.
            click_pause: Duration of the mouse move that precedes a click.
        """
        self.client = client
        self.logger = logger
        self.artifacts = artifacts
        self.model = model
        self.max_steps = max_steps
        self.command_timeout = command_timeout
        self.type_interval = type_interval
        self.click_pause = click_pause
        self.step = 0
        self.completed = False
        self.final_result = ""
        self.previous_response_id: str | None = None
        self.last_screen_data_url: str | None = None
        self.last_screen_meta: dict[str, Any] | None = None
        # (x, y, timestamp) of recent click attempts, used for loop detection.
        self.click_history: list[tuple[int, int, float]] = []

    def _tool_schemas(self) -> list[dict[str, Any]]:
        """Return the function-tool definitions advertised to the model."""

        def xy_object(required: list[str]) -> dict[str, Any]:
            # Fresh dict per call so no schema fragment is shared between tools.
            return {
                "type": "object",
                "properties": {
                    "x": {"type": "integer"},
                    "y": {"type": "integer"},
                },
                "required": required,
                "additionalProperties": False,
            }

        def function_tool(
            name: str,
            description: str,
            properties: dict[str, Any],
            required: list[str] | None = None,
        ) -> dict[str, Any]:
            parameters: dict[str, Any] = {"type": "object", "properties": properties}
            if required is not None:
                parameters["required"] = required
            parameters["additionalProperties"] = False
            return {
                "type": "function",
                "name": name,
                "description": description,
                "parameters": parameters,
            }

        return [
            function_tool(
                "task_complete",
                "Call this when the job objective is fully done.",
                {"result": {"type": "string"}},
                ["result"],
            ),
            function_tool(
                "execute_command",
                "Run a shell command and return stdout/stderr/exit code. "
                "Prefer this for deterministic operations like opening URLs.",
                {"command": {"type": "string"}},
                ["command"],
            ),
            function_tool(
                "sleep",
                "Pause execution for a short time. "
                "Use this instead of shell sleep commands.",
                {"seconds": {"type": ["number", "string"]}},
                ["seconds"],
            ),
            function_tool(
                "see_screen",
                "Capture full screen with coordinate grid overlay.",
                {},
            ),
            function_tool(
                "enhance",
                "Create enhanced zoom around a coordinate for readability.",
                {"coordinate": xy_object(["x", "y"])},
                ["coordinate"],
            ),
            function_tool(
                "type",
                "Type literal text into the active focused element.",
                {"text": {"type": "string"}},
                ["text"],
            ),
            function_tool(
                "press_key",
                "Press a specific key (enter, tab, esc, arrows, etc).",
                {
                    "key": {"type": "string"},
                    "repeats": {"type": "integer", "minimum": 1},
                },
                ["key"],
            ),
            function_tool(
                "click",
                "Click absolute screen coordinate with simple directional offsets. "
                "Use offset_up/down/left/right values like 2 or '2px'. "
                "Optional sleep_after_seconds performs a pause immediately after click.",
                {
                    "coordinate": xy_object(["x", "y"]),
                    "offset": xy_object([]),
                    "offset_up": {"type": ["integer", "string"]},
                    "offset_down": {"type": ["integer", "string"]},
                    "offset_left": {"type": ["integer", "string"]},
                    "offset_right": {"type": ["integer", "string"]},
                    "sleep_after_seconds": {"type": ["number", "string"]},
                },
                ["coordinate"],
            ),
        ]

    def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]:
        """Take an RGB screenshot, optionally with the grid overlay, plus metadata."""
        screenshot = pyautogui.screenshot().convert("RGB")
        width, height = screenshot.size
        image = draw_global_grid(screenshot) if with_grid else screenshot
        meta = {
            "width": width,
            "height": height,
            "captured_at": utc_now_iso(),
            "grid": bool(with_grid),
        }
        return image, meta

    def _save_image(self, image: Image.Image, path) -> None:
        """Write ``image`` as PNG to ``path``, creating parent directories first."""
        path.parent.mkdir(parents=True, exist_ok=True)
        image.save(path, format="PNG")

    def _text_message(self, text: str) -> dict[str, Any]:
        """Wrap plain text in a user message for the model input list."""
        return {"role": "user", "content": [{"type": "input_text", "text": text}]}

    def _build_visual_message(
        self,
        title: str,
        data_url: str,
        meta: dict[str, Any],
        hint: str | None = None,
    ) -> dict[str, Any]:
        """Build a user message carrying an image plus its metadata.

        Args:
            title: First line of the accompanying text.
            data_url: Image encoded as a data URL.
            meta: JSON-serializable metadata shown to the model.
            hint: Closing instruction; defaults to the full-screen hint that
                tells the model to use image coordinates directly.
        """
        closing = self._DEFAULT_VISUAL_HINT if hint is None else hint
        text = (
            f"{title}\n"
            f"Metadata: {json.dumps(meta, ensure_ascii=False)}\n"
            f"{closing}"
        )
        return {
            "role": "user",
            "content": [
                {"type": "input_text", "text": text},
                {"type": "input_image", "image_url": data_url, "detail": "high"},
            ],
        }

    def _parse_px(self, value: Any) -> int:
        """Convert a pixel offset such as ``2``, ``2.4`` or ``"2px"`` to an int.

        Numeric strings and floats are both rounded to the nearest integer;
        ``None``, unparseable text and non-finite numbers yield 0.
        """
        if value is None:
            return 0
        if isinstance(value, bool):
            return int(value)
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            number = value
        else:
            text = str(value).strip().lower()
            if text.endswith("px"):
                text = text[:-2].strip()
            try:
                number = float(text)
            except ValueError:
                return 0
        try:
            return int(round(number))
        except (ValueError, OverflowError):
            # round() raises for NaN (ValueError) and infinities (OverflowError).
            return 0

    def _parse_seconds(self, value: Any, default: float = 0.0, max_seconds: float = 60.0) -> float:
        """Convert a duration such as ``1.5``, ``"2s"`` or ``"500ms"`` to seconds.

        ``None`` returns ``default`` as-is. Unparseable or NaN input falls back
        to ``default``; the result is then clamped to ``[0, max_seconds]``.
        """
        if value is None:
            return default
        try:
            if isinstance(value, (int, float)):
                sec = float(value)
            else:
                text = str(value).strip().lower()
                if text.endswith("ms"):
                    sec = float(text[:-2].strip()) / 1000.0
                else:
                    if text.endswith("s"):
                        text = text[:-1].strip()
                    sec = float(text)
        except (ValueError, OverflowError):
            sec = default
        if sec != sec:
            # NaN compares unequal to itself; it would slip through both clamps
            # below and make time.sleep raise.
            sec = default
        if sec < 0:
            sec = 0.0
        if sec > max_seconds:
            sec = max_seconds
        return sec

    def _read_coordinate(self, args: dict[str, Any]) -> tuple[int, int] | None:
        """Return ``(x, y)`` from ``args["coordinate"]`` or None if unusable."""
        coord = args.get("coordinate")
        if not isinstance(coord, dict) or "x" not in coord or "y" not in coord:
            return None
        try:
            return int(coord["x"]), int(coord["y"])
        except (TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def _tail_text(data: Any, limit: int = 12000) -> str:
        """Return the last ``limit`` characters of ``data`` as text.

        Accepts ``None`` and bytes because ``subprocess.TimeoutExpired`` may
        carry captured output as bytes (or None) even when text mode was used.
        """
        if data is None:
            return ""
        if isinstance(data, (bytes, bytearray)):
            data = bytes(data).decode(errors="replace")
        return str(data)[-limit:]

    def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
        """Capture the screen with grid, save it, and remember it for the model."""
        image, meta = self._capture_screen(with_grid=True)
        out_path = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
        self._save_image(image, out_path)
        resolved = str(out_path.resolve())
        self.last_screen_data_url = image_to_data_url(image, "PNG")
        self.last_screen_meta = meta | {"path": resolved}
        return {
            "ok": True,
            "path": resolved,
            "meta": self.last_screen_meta,
            "message": "Screen captured with coordinate grid.",
        }

    def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
        """Crop around a coordinate, upscale 2x, sharpen, and remember the result."""
        point = self._read_coordinate(args)
        if point is None:
            return {"ok": False, "error": "Missing or invalid coordinate; expected {x, y} integers."}
        x, y = point
        base, base_meta = self._capture_screen(with_grid=False)
        width, height = base.size
        region_half = 180
        # Clamp so the crop box always lies inside the image and is at least 1px.
        left = clamp(x - region_half, 0, width - 1)
        top = clamp(y - region_half, 0, height - 1)
        right = clamp(x + region_half, left + 1, width)
        bottom = clamp(y + region_half, top + 1, height)
        crop = base.crop((left, top, right, bottom))
        upscaled = crop.resize((crop.width * 2, crop.height * 2), Image.Resampling.BICUBIC)
        enhanced = ImageOps.autocontrast(upscaled)
        enhanced = ImageEnhance.Sharpness(enhanced).enhance(2.0)
        enhanced = ImageEnhance.Contrast(enhanced).enhance(1.25)
        enhanced = enhanced.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))
        out_path = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png"
        self._save_image(enhanced, out_path)
        meta = {
            "captured_at": utc_now_iso(),
            "source_coord": {"x": x, "y": y},
            "source_box": {"left": left, "top": top, "right": right, "bottom": bottom},
            "scale": 2,
            "path": str(out_path.resolve()),
            "screen_size": {"width": width, "height": height},
            "base_capture_meta": base_meta,
        }
        self.last_screen_data_url = image_to_data_url(enhanced, "PNG")
        self.last_screen_meta = meta
        return {"ok": True, "meta": meta, "message": "Enhanced view generated."}

    def _tool_click(self, args: dict[str, Any]) -> dict[str, Any]:
        """Click at a coordinate plus offsets, refusing tight repeated-click loops."""
        point = self._read_coordinate(args)
        if point is None:
            # Never fall back to (0, 0): that would be a real click in the corner.
            return {"ok": False, "error": "Missing or invalid coordinate; expected {x, y} integers."}
        base_x, base_y = point
        offset = args.get("offset")
        if not isinstance(offset, dict):
            offset = {}
        legacy_dx = self._parse_px(offset.get("x", 0))
        legacy_dy = self._parse_px(offset.get("y", 0))
        up = self._parse_px(args.get("offset_up", 0))
        down = self._parse_px(args.get("offset_down", 0))
        left = self._parse_px(args.get("offset_left", 0))
        right = self._parse_px(args.get("offset_right", 0))
        width, height = pyautogui.size()
        x = clamp(base_x + legacy_dx + right - left, 0, max(0, width - 1))
        y = clamp(base_y + legacy_dy + down - up, 0, max(0, height - 1))

        # The attempt is recorded before the check, so blocked attempts also
        # count towards the loop detector.
        now = time.time()
        self.click_history.append((x, y, now))
        self.click_history = self.click_history[-20:]
        near_same = [
            (cx, cy, ts)
            for (cx, cy, ts) in self.click_history
            if now - ts <= 25 and abs(cx - x) <= 6 and abs(cy - y) <= 6
        ]
        if len(near_same) >= 4:
            return {
                "ok": False,
                "blocked": True,
                "error": (
                    "Repeated click loop detected at nearly same coordinate. "
                    "Switch strategy: call see_screen/enhance and use execute_command."
                ),
                "clicked": {"x": x, "y": y},
                "recent_similar_clicks": len(near_same),
            }

        pyautogui.moveTo(x, y, duration=self.click_pause)
        pyautogui.click(x=x, y=y)
        sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0)
        # Always leave a short settle time, even when no explicit pause was asked for.
        time.sleep(sleep_after if sleep_after > 0 else 0.15)
        return {
            "ok": True,
            "clicked": {"x": x, "y": y},
            "base_coordinate": {"x": base_x, "y": base_y},
            "applied_offset": {
                "legacy": {"x": legacy_dx, "y": legacy_dy},
                "directional": {"up": up, "down": down, "left": left, "right": right},
            },
            "sleep_after_seconds": sleep_after,
            "screen_size": {"width": width, "height": height},
            "message": "Click executed.",
        }

    def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]:
        """Type literal text into whatever currently has keyboard focus."""
        text = str(args.get("text", ""))
        pyautogui.write(text, interval=self.type_interval)
        return {"ok": True, "typed_length": len(text), "message": "Text typed."}

    def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
        """Press a named key, or a ``ctrl+c``-style chord, up to 50 times.

        Key names are checked against ``pyautogui.KEYBOARD_KEYS`` so that an
        unknown name is reported as an error instead of a silent no-op.
        """
        key = str(args.get("key", "")).strip().lower()
        if not key:
            return {"ok": False, "error": "Missing key."}
        try:
            repeats = int(args.get("repeats", 1))
        except (TypeError, ValueError):
            repeats = 1
        repeats = min(max(1, repeats), 50)

        # A lone "+" is a key; anything longer containing "+" is a chord.
        if len(key) > 1 and "+" in key:
            chord = [part.strip() for part in key.split("+")]
        else:
            chord = [key]
        unknown = [part for part in chord if part not in pyautogui.KEYBOARD_KEYS]
        if unknown:
            return {
                "ok": False,
                "key": key,
                "error": f"Unsupported key name(s): {unknown}. Use names like enter, tab, esc, up, ctrl+c.",
            }

        for _ in range(repeats):
            if len(chord) > 1:
                pyautogui.hotkey(*chord)
            else:
                pyautogui.press(key)
            time.sleep(0.03)
        return {"ok": True, "key": key, "repeats": repeats, "message": "Key press executed."}

    def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
        """Sleep for the requested duration, capped at 60 seconds."""
        seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
        time.sleep(seconds)
        return {"ok": True, "slept_seconds": seconds, "message": "Sleep completed."}

    def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]:
        """Run a shell command and return its exit code and output tails.

        ``ok`` reports that the command ran to completion, not that it
        succeeded; callers must inspect ``exit_code``.
        """
        command = str(args.get("command", "")).strip()
        if not command:
            return {"ok": False, "error": "Empty command."}
        started = time.time()
        try:
            # SECURITY: the model-supplied string is run through the shell by
            # design (e.g. Windows "start <url>"); only run this agent in an
            # environment where arbitrary command execution is acceptable.
            completed = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                errors="replace",  # undecodable output must not discard the result
                timeout=self.command_timeout,
                check=False,
            )
        except subprocess.TimeoutExpired as exc:
            return {
                "ok": False,
                "command": command,
                "error": "Command timed out.",
                "elapsed_ms": int((time.time() - started) * 1000),
                "timeout_seconds": self.command_timeout,
                # Captured output on timeout can be bytes or None.
                "stdout": self._tail_text(exc.stdout, self._OUTPUT_TAIL_CHARS),
                "stderr": self._tail_text(exc.stderr, self._OUTPUT_TAIL_CHARS),
            }
        except Exception as exc:  # noqa: BLE001
            return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"}
        return {
            "ok": True,
            "command": command,
            "exit_code": completed.returncode,
            "stdout": self._tail_text(completed.stdout, self._OUTPUT_TAIL_CHARS),
            "stderr": self._tail_text(completed.stderr, self._OUTPUT_TAIL_CHARS),
            "elapsed_ms": int((time.time() - started) * 1000),
        }

    def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]:
        """Mark the run as finished and store the model's final result text."""
        result = str(args.get("result", "")).strip() or "Task completed."
        self.completed = True
        self.final_result = result
        return {"ok": True, "result": result}

    def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
        """Route a tool call to its handler and return the handler's result.

        Unknown tool names and arguments that failed JSON parsing (marked with
        ``_raw`` by ``_safe_parse_args``) produce an error result instead of
        running a tool on default values.
        """
        handlers = {
            "see_screen": self._tool_see_screen,
            "enhance": self._tool_enhance,
            "click": self._tool_click,
            "type": self._tool_type,
            "press_key": self._tool_press_key,
            "sleep": self._tool_sleep,
            "execute_command": self._tool_execute_command,
            "task_complete": self._tool_task_complete,
        }
        handler = handlers.get(name)
        if handler is None:
            return {"ok": False, "error": f"Unknown tool: {name}"}
        if "_raw" in args:
            return {
                "ok": False,
                "error": "Tool arguments were not valid JSON; resend the call with a JSON object.",
                "raw_arguments": str(args["_raw"])[:500],
            }
        return handler(args)

    def _safe_parse_args(self, raw: str | None) -> dict[str, Any]:
        """Parse tool-call arguments without raising.

        Returns ``{}`` for empty input, ``{"value": parsed}`` for valid JSON
        that is not an object, and ``{"_raw": raw}`` when parsing fails.
        """
        if not raw:
            return {}
        try:
            parsed = json.loads(raw)
        except (ValueError, TypeError, RecursionError):
            return {"_raw": raw}
        return parsed if isinstance(parsed, dict) else {"value": parsed}

    def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
        """Send one request to the model, chained to the previous response."""
        return self.client.responses.create(
            model=self.model,
            instructions=SYSTEM_PROMPT,
            tools=self._tool_schemas(),
            input=input_items,
            previous_response_id=self.previous_response_id,
            parallel_tool_calls=True,
            max_tool_calls=8,
        )

    def _initial_input(self, job: str) -> list[dict[str, Any]]:
        """Build the first model input: the job text plus the latest capture, if any."""
        items = [
            self._text_message(
                f"JOB: {job}\n"
                "You are in an action loop. Prefer execute_command for deterministic actions. "
                "You can return multiple tool calls in one step (example: click then sleep). "
                "Call task_complete(result=...) only when truly done."
            )
        ]
        if self.last_screen_data_url and self.last_screen_meta:
            items.append(
                self._build_visual_message(
                    "Initial screen capture", self.last_screen_data_url, self.last_screen_meta
                )
            )
        return items

    def _run_tool_call(self, tool_call: Any) -> list[dict[str, Any]]:
        """Execute one function call and return the input items answering it.

        The list always starts with the ``function_call_output``; for
        ``see_screen`` and ``enhance`` it is followed by the image message.
        """
        name = str(getattr(tool_call, "name", ""))
        call_id = str(getattr(tool_call, "call_id", ""))
        args = self._safe_parse_args(getattr(tool_call, "arguments", "{}"))
        self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False))
        try:
            result = self._dispatch_tool(name, args)
        except Exception as exc:  # noqa: BLE001
            self.logger.exception("Tool execution failed: %s", name)
            result = {
                "ok": False,
                "error": f"{type(exc).__name__}: {exc}",
                "traceback": traceback.format_exc()[-8000:],
            }
        # Encode once; default=str is a deliberate safety net so an unexpected
        # value in a tool result can never abort the whole run.
        payload = json.dumps(result, ensure_ascii=False, default=str)
        self.logger.debug("Tool result for %s: %s", name, payload[:2500])
        items: list[dict[str, Any]] = [
            {"type": "function_call_output", "call_id": call_id, "output": payload}
        ]
        if name in ("see_screen", "enhance") and self.last_screen_data_url and self.last_screen_meta:
            if name == "see_screen":
                title, hint = "Updated screen capture", None
            else:
                title, hint = "Enhanced screen region", self._ENHANCE_VISUAL_HINT
            items.append(
                self._build_visual_message(
                    title, self.last_screen_data_url, self.last_screen_meta, hint=hint
                )
            )
        return items

    def run(self, job: str) -> AgentResult:
        """Run the agent loop for ``job`` and report the outcome.

        Returns:
            An ``AgentResult`` with ``completed=True`` and the model's result
            when ``task_complete`` was called, otherwise ``completed=False``
            after ``max_steps`` steps.

        Raises:
            RuntimeError: If a model API call fails.
        """
        started_at = time.time()
        self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.model)
        self.logger.info("Job: %s", job)
        self._tool_see_screen({})
        pending_input = self._initial_input(job)

        while self.step < self.max_steps and not self.completed:
            self.step += 1
            self.logger.info("---- Agent step %d/%d ----", self.step, self.max_steps)
            try:
                response = self._call_model(pending_input)
            except Exception as exc:  # noqa: BLE001
                self.logger.exception("OpenAI API call failed on step %d", self.step)
                raise RuntimeError(f"OpenAI API call failed: {type(exc).__name__}: {exc}") from exc
            self.previous_response_id = response.id

            text_preview = (getattr(response, "output_text", "") or "").strip()
            if text_preview:
                self.logger.info("Model text: %s", text_preview[:500])

            tool_calls = [
                item
                for item in (response.output or [])
                if getattr(item, "type", None) == "function_call"
            ]
            if not tool_calls:
                self.logger.warning("No tool calls returned; nudging model to continue with tools.")
                pending_input = [self._text_message(self._NUDGE_TEXT)]
                continue

            next_input: list[dict[str, Any]] = []
            for tool_call in tool_calls:
                next_input.extend(self._run_tool_call(tool_call))
            pending_input = next_input

        ended_at = time.time()
        if self.completed:
            self.logger.info("Task completed in %d step(s).", self.step)
            return AgentResult(
                completed=True,
                result=self.final_result,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
            )
        self.logger.warning("Stopped due to step limit (%d).", self.max_steps)
        return AgentResult(
            completed=False,
            result=f"Stopped after max steps ({self.max_steps}) without task_complete.",
            steps=self.step,
            started_at=started_at,
            ended_at=ended_at,
        )