feat: add shared runtime with FastAPI job server and safety pipeline

This commit is contained in:
Space-Banane
2026-05-27 17:43:51 +02:00
parent 84b0df520c
commit 10355bf11a
14 changed files with 1516 additions and 157 deletions

View File

@@ -3,14 +3,16 @@ from __future__ import annotations
import json
import logging
import subprocess
import threading
import time
import traceback
from typing import Any
from typing import Any, Callable
from openai import OpenAI
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
from .models import AgentResult, RunArtifacts
from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
from .pricing import estimate_cost_usd
from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso
try:
@@ -46,32 +48,78 @@ class ScreenJobAgent:
client: OpenAI,
logger: logging.Logger,
artifacts: RunArtifacts,
model: str,
max_steps: int,
command_timeout: int,
type_interval: float,
click_pause: float,
options: RuntimeOptions,
cancel_event: threading.Event | None = None,
event_callback: Callable[[dict[str, Any]], None] | None = None,
) -> None:
self.client = client
self.logger = logger
self.artifacts = artifacts
self.model = model
self.max_steps = max_steps
self.command_timeout = command_timeout
self.type_interval = type_interval
self.click_pause = click_pause
self.options = options
self.cancel_event = cancel_event or threading.Event()
self.event_callback = event_callback
self.step = 0
self.completed = False
self.final_result = ""
self.previous_response_id: str | None = None
self.usage = UsageSummary()
self.last_screen_data_url: str | None = None
self.last_screen_meta: dict[str, Any] | None = None
self.click_history: list[tuple[int, int, float]] = []
self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
if self.event_callback is None:
return
event = {
"ts": utc_now_iso(),
"job_run_id": self.artifacts.run_id,
"step": self.step,
"event_type": event_type,
"payload": payload,
}
try:
self.event_callback(event)
except Exception: # noqa: BLE001
self.logger.debug("Event callback failed.", exc_info=True)
def _is_cancelled(self) -> bool:
return bool(self.cancel_event.is_set())
def _register_usage(self, response: Any) -> None:
usage_obj = getattr(response, "usage", None)
if usage_obj is None:
return
input_tokens = int(getattr(usage_obj, "input_tokens", 0) or 0)
output_tokens = int(getattr(usage_obj, "output_tokens", 0) or 0)
total_tokens = int(getattr(usage_obj, "total_tokens", input_tokens + output_tokens) or 0)
input_details = getattr(usage_obj, "input_tokens_details", None)
cached_tokens = int(getattr(input_details, "cached_tokens", 0) or 0) if input_details else 0
output_details = getattr(usage_obj, "output_tokens_details", None)
reasoning_tokens = int(getattr(output_details, "reasoning_tokens", 0) or 0) if output_details else 0
self.usage.input_tokens += input_tokens
self.usage.cached_input_tokens += cached_tokens
self.usage.output_tokens += output_tokens
self.usage.reasoning_tokens += reasoning_tokens
self.usage.total_tokens += total_tokens
estimated_cost, model_for_pricing = estimate_cost_usd(self.options.model, self.usage)
self.usage.estimated_cost_usd = estimated_cost
self.usage.model_for_pricing = model_for_pricing
self._emit(
"usage_update",
{
"usage": self.usage.to_dict(),
"response_id": getattr(response, "id", None),
},
)
def _tool_schemas(self) -> list[dict[str, Any]]:
return [
all_tools: list[dict[str, Any]] = [
{
"type": "function",
"name": "task_complete",
@@ -213,6 +261,7 @@ class ScreenJobAgent:
},
},
]
return [tool for tool in all_tools if tool["name"] not in self.disabled_tools]
def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]:
screenshot = pyautogui.screenshot().convert("RGB")
@@ -378,13 +427,16 @@ class ScreenJobAgent:
"recent_similar_clicks": len(near_same),
}
pyautogui.moveTo(x, y, duration=self.click_pause)
pyautogui.moveTo(x, y, duration=self.options.click_pause)
pyautogui.click(x=x, y=y)
sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0)
if sleep_after > 0:
time.sleep(sleep_after)
else:
time.sleep(0.15)
wait_remaining = sleep_after if sleep_after > 0 else 0.15
while wait_remaining > 0:
if self._is_cancelled():
break
interval = min(0.05, wait_remaining)
time.sleep(interval)
wait_remaining -= interval
return {
"ok": True,
@@ -401,7 +453,11 @@ class ScreenJobAgent:
def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]:
text = str(args.get("text", ""))
pyautogui.write(text, interval=self.type_interval)
for char in text:
if self._is_cancelled():
return {"ok": False, "cancelled": True, "typed_length": 0}
pyautogui.write(char, interval=0)
time.sleep(self.options.type_interval)
return {"ok": True, "typed_length": len(text), "message": "Text typed."}
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
@@ -410,15 +466,25 @@ class ScreenJobAgent:
if not key:
return {"ok": False, "error": "Missing key."}
repeats = min(repeats, 50)
pressed = 0
for _ in range(repeats):
if self._is_cancelled():
break
pyautogui.press(key)
pressed += 1
time.sleep(0.03)
return {"ok": True, "key": key, "repeats": repeats, "message": "Key press executed."}
return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
    """Pause for the requested number of seconds while staying cancellable.

    The duration comes from ``args["seconds"]`` and is parsed by
    ``self._parse_seconds`` with ``default=0.0`` and ``max_seconds=60.0``.
    The wait is done in slices of at most 0.1 s on ``self.cancel_event`` so
    a cancellation request ends the pause promptly.

    Args:
        args: Tool arguments; only ``seconds`` is read.

    Returns:
        ``{"ok": True, "slept_seconds": ..., "message": ...}`` after the full
        pause, or ``{"ok": False, "cancelled": True, "slept_seconds": ...}``
        with the measured elapsed time when cancelled early.
    """
    seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
    # No unconditional ``time.sleep(seconds)`` here: it would block for the
    # whole duration and make the cancellation path unreachable.
    started = time.monotonic()
    deadline = started + seconds
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # ``Event.wait`` returns True as soon as the event is set, so the
        # pause ends on cancellation without waiting out the slice.
        if self.cancel_event.wait(timeout=min(0.1, remaining)):
            return {
                "ok": False,
                "cancelled": True,
                # Measured rather than summed from nominal slices, since a
                # sleep may overshoot its requested length.
                "slept_seconds": round(time.monotonic() - started, 3),
            }
    return {"ok": True, "slept_seconds": round(seconds, 3), "message": "Sleep completed."}
def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]:
command = str(args.get("command", "")).strip()
@@ -426,36 +492,55 @@ class ScreenJobAgent:
return {"ok": False, "error": "Empty command."}
started = time.time()
process: subprocess.Popen[str] | None = None
try:
completed = subprocess.run(
process = subprocess.Popen(
command,
shell=True,
capture_output=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=self.command_timeout,
check=False,
)
elapsed_ms = int((time.time() - started) * 1000)
while True:
if self._is_cancelled():
process.terminate()
return {
"ok": False,
"cancelled": True,
"command": command,
"elapsed_ms": int((time.time() - started) * 1000),
}
if process.poll() is not None:
break
if (time.time() - started) > self.options.command_timeout:
process.kill()
stdout, stderr = process.communicate(timeout=2)
return {
"ok": False,
"command": command,
"error": "Command timed out.",
"elapsed_ms": int((time.time() - started) * 1000),
"timeout_seconds": self.options.command_timeout,
"stdout": (stdout or "")[-12000:],
"stderr": (stderr or "")[-12000:],
}
time.sleep(0.05)
stdout, stderr = process.communicate(timeout=2)
return {
"ok": True,
"command": command,
"exit_code": completed.returncode,
"stdout": completed.stdout[-12000:],
"stderr": completed.stderr[-12000:],
"elapsed_ms": elapsed_ms,
}
except subprocess.TimeoutExpired as exc:
elapsed_ms = int((time.time() - started) * 1000)
return {
"ok": False,
"command": command,
"error": "Command timed out.",
"elapsed_ms": elapsed_ms,
"timeout_seconds": self.command_timeout,
"stdout": (exc.stdout or "")[-12000:],
"stderr": (exc.stderr or "")[-12000:],
"exit_code": process.returncode,
"stdout": (stdout or "")[-12000:],
"stderr": (stderr or "")[-12000:],
"elapsed_ms": int((time.time() - started) * 1000),
}
except Exception as exc: # noqa: BLE001
if process is not None and process.poll() is None:
try:
process.kill()
except Exception: # noqa: BLE001
pass
return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"}
def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]:
@@ -465,6 +550,8 @@ class ScreenJobAgent:
return {"ok": True, "result": result}
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
if name in self.disabled_tools:
return {"ok": False, "error": f"Tool '{name}' is disabled for this job."}
handlers = {
"see_screen": self._tool_see_screen,
"enhance": self._tool_enhance,
@@ -491,7 +578,7 @@ class ScreenJobAgent:
def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
return self.client.responses.create(
model=self.model,
model=self.options.model,
instructions=SYSTEM_PROMPT,
tools=self._tool_schemas(),
input=input_items,
@@ -502,8 +589,18 @@ class ScreenJobAgent:
def run(self, job: str) -> AgentResult:
started_at = time.time()
self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.model)
self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
self.logger.info("Job: %s", job)
self.logger.info("Disabled tools: %s", sorted(self.disabled_tools))
self._emit(
"job_started",
{
"run_id": self.artifacts.run_id,
"model": self.options.model,
"objective": job,
"disabled_tools": sorted(self.disabled_tools),
},
)
self._tool_see_screen({})
init_input: list[dict[str, Any]] = [
@@ -528,25 +625,37 @@ class ScreenJobAgent:
)
pending_input = init_input
error_text: str | None = None
cancelled = False
while self.step < self.options.max_steps and not self.completed:
if self._is_cancelled():
cancelled = True
error_text = "Cancelled by user request."
break
while self.step < self.max_steps and not self.completed:
self.step += 1
self.logger.info("---- Agent step %d/%d ----", self.step, self.max_steps)
self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
try:
response = self._call_model(pending_input)
self._register_usage(response)
except Exception as exc: # noqa: BLE001
self.logger.exception("OpenAI API call failed on step %d", self.step)
raise RuntimeError(f"OpenAI API call failed: {type(exc).__name__}: {exc}") from exc
error_text = f"OpenAI API call failed: {type(exc).__name__}: {exc}"
break
self.previous_response_id = response.id
output_items = list(response.output or [])
text_preview = getattr(response, "output_text", "") or ""
if text_preview.strip():
self.logger.info("Model text: %s", text_preview.strip()[:500])
self._emit("model_text", {"step": self.step, "text": text_preview.strip()[:2000]})
tool_calls = [item for item in output_items if getattr(item, "type", None) == "function_call"]
if not tool_calls:
self.logger.warning("No tool calls returned; nudging model to continue with tools.")
self._emit("step_warning", {"step": self.step, "message": "No tool calls; nudged model."})
pending_input = [
{
"role": "user",
@@ -566,12 +675,21 @@ class ScreenJobAgent:
next_input: list[dict[str, Any]] = []
for tool_call in tool_calls:
if self._is_cancelled():
cancelled = True
error_text = "Cancelled by user request."
break
name = str(getattr(tool_call, "name", ""))
call_id = str(getattr(tool_call, "call_id", ""))
args_raw = getattr(tool_call, "arguments", "{}")
args = self._safe_parse_args(args_raw)
self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False))
self._emit(
"tool_called",
{"step": self.step, "tool": name, "args": args},
)
try:
result = self._dispatch_tool(name, args)
except Exception as exc: # noqa: BLE001
@@ -587,6 +705,7 @@ class ScreenJobAgent:
name,
json.dumps(result, ensure_ascii=False)[:2500],
)
self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
next_input.append(
{
"type": "function_call_output",
@@ -600,26 +719,69 @@ class ScreenJobAgent:
next_input.append(
self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta)
)
self._emit(
"visual_update",
{
"step": self.step,
"kind": name,
"image_meta": self.last_screen_meta,
},
)
if cancelled:
break
pending_input = next_input
ended_at = time.time()
if self.completed:
self.logger.info("Task completed in %d step(s).", self.step)
self._emit("job_completed", {"result": self.final_result, "steps": self.step, "usage": self.usage.to_dict()})
return AgentResult(
completed=True,
result=self.final_result,
steps=self.step,
started_at=started_at,
ended_at=ended_at,
usage=self.usage,
)
self.logger.warning("Stopped due to step limit (%d).", self.max_steps)
if cancelled:
self.logger.warning("Run cancelled by user after %d step(s).", self.step)
self._emit("job_cancelled", {"steps": self.step, "usage": self.usage.to_dict()})
return AgentResult(
completed=False,
result="Cancelled by user request.",
steps=self.step,
started_at=started_at,
ended_at=ended_at,
usage=self.usage,
error=error_text,
cancelled=True,
)
if error_text:
self.logger.error("Run failed: %s", error_text)
self._emit("job_failed", {"steps": self.step, "error": error_text, "usage": self.usage.to_dict()})
return AgentResult(
completed=False,
result=error_text,
steps=self.step,
started_at=started_at,
ended_at=ended_at,
usage=self.usage,
error=error_text,
)
self.logger.warning("Stopped due to step limit (%d).", self.options.max_steps)
result_text = f"Stopped after max steps ({self.options.max_steps}) without task_complete."
self._emit("job_stopped", {"steps": self.step, "reason": "max_steps", "usage": self.usage.to_dict()})
return AgentResult(
completed=False,
result=f"Stopped after max steps ({self.max_steps}) without task_complete.",
result=result_text,
steps=self.step,
started_at=started_at,
ended_at=ended_at,
usage=self.usage,
error=result_text,
)