feat: add shared runtime with FastAPI job server and safety pipeline

2026-05-27 17:43:51 +02:00
parent 84b0df520c
commit 10355bf11a
14 changed files with 1516 additions and 157 deletions
--- a/src/agent.py
+++ b/src/agent.py
@@ -3,14 +3,16 @@ from __future__ import annotations
 import json
 import logging
 import subprocess
+import threading
 import time
 import traceback
-from typing import Any
+from typing import Any, Callable

 from openai import OpenAI
 from PIL import Image, ImageEnhance, ImageFilter, ImageOps

-from .models import AgentResult, RunArtifacts
+from .models import AgentResult, RunArtifacts, RuntimeOptions, UsageSummary
+from .pricing import estimate_cost_usd
 from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso

 try:
@@ -46,32 +48,78 @@ class ScreenJobAgent:
        client: OpenAI,
        logger: logging.Logger,
        artifacts: RunArtifacts,
-        model: str,
-        max_steps: int,
-        command_timeout: int,
-        type_interval: float,
-        click_pause: float,
+        options: RuntimeOptions,
+        cancel_event: threading.Event | None = None,
+        event_callback: Callable[[dict[str, Any]], None] | None = None,
    ) -> None:
        self.client = client
        self.logger = logger
        self.artifacts = artifacts
-        self.model = model
-        self.max_steps = max_steps
-        self.command_timeout = command_timeout
-        self.type_interval = type_interval
-        self.click_pause = click_pause
+        self.options = options
+        self.cancel_event = cancel_event or threading.Event()
+        self.event_callback = event_callback

        self.step = 0
        self.completed = False
        self.final_result = ""
        self.previous_response_id: str | None = None
+        self.usage = UsageSummary()

        self.last_screen_data_url: str | None = None
        self.last_screen_meta: dict[str, Any] | None = None
        self.click_history: list[tuple[int, int, float]] = []
+        self.disabled_tools = {tool.strip() for tool in (options.disable_tools or set()) if tool.strip()}
+
+    def _emit(self, event_type: str, payload: dict[str, Any]) -> None:
+        if self.event_callback is None:  # no listener registered, so there is nothing to emit
+            return
+        event = {  # envelope: timestamp, run id and current step wrapped around the caller's payload
+            "ts": utc_now_iso(),
+            "job_run_id": self.artifacts.run_id,
+            "step": self.step,
+            "event_type": event_type,
+            "payload": payload,
+        }
+        try:
+            self.event_callback(event)
+        except Exception:  # noqa: BLE001
+            self.logger.debug("Event callback failed.", exc_info=True)  # callback errors are logged, never raised
+
+    def _is_cancelled(self) -> bool:
+        return bool(self.cancel_event.is_set())  # True once the cancel event has been set
+
+    def _register_usage(self, response: Any) -> None:
+        usage_obj = getattr(response, "usage", None)
+        if usage_obj is None:
+            return
+        input_tokens = int(getattr(usage_obj, "input_tokens", 0) or 0)
+        output_tokens = int(getattr(usage_obj, "output_tokens", 0) or 0)
+        # Fall back to input + output when total_tokens is absent, None or 0 (not only when absent).
+        total_tokens = int(getattr(usage_obj, "total_tokens", None) or (input_tokens + output_tokens))
+        input_details = getattr(usage_obj, "input_tokens_details", None)
+        cached_tokens = int(getattr(input_details, "cached_tokens", 0) or 0) if input_details else 0
+        output_details = getattr(usage_obj, "output_tokens_details", None)
+        reasoning_tokens = int(getattr(output_details, "reasoning_tokens", 0) or 0) if output_details else 0
+
+        self.usage.input_tokens += input_tokens
+        self.usage.cached_input_tokens += cached_tokens
+        self.usage.output_tokens += output_tokens
+        self.usage.reasoning_tokens += reasoning_tokens
+        self.usage.total_tokens += total_tokens
+        estimated_cost, model_for_pricing = estimate_cost_usd(self.options.model, self.usage)
+        self.usage.estimated_cost_usd = estimated_cost
+        self.usage.model_for_pricing = model_for_pricing
+
+        self._emit(
+            "usage_update",
+            {
+                "usage": self.usage.to_dict(),
+                "response_id": getattr(response, "id", None),
+            },
+        )

    def _tool_schemas(self) -> list[dict[str, Any]]:
-        return [
+        all_tools: list[dict[str, Any]] = [
            {
                "type": "function",
                "name": "task_complete",
@@ -213,6 +261,7 @@ class ScreenJobAgent:
                },
            },
        ]
+        return [tool for tool in all_tools if tool["name"] not in self.disabled_tools]

    def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]:
        screenshot = pyautogui.screenshot().convert("RGB")
@@ -378,13 +427,16 @@ class ScreenJobAgent:
                "recent_similar_clicks": len(near_same),
            }

-        pyautogui.moveTo(x, y, duration=self.click_pause)
+        pyautogui.moveTo(x, y, duration=self.options.click_pause)
        pyautogui.click(x=x, y=y)
        sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0)
-        if sleep_after > 0:
-            time.sleep(sleep_after)
-        else:
-            time.sleep(0.15)
+        wait_remaining = sleep_after if sleep_after > 0 else 0.15
+        while wait_remaining > 0:
+            if self._is_cancelled():
+                break
+            interval = min(0.05, wait_remaining)
+            time.sleep(interval)
+            wait_remaining -= interval

        return {
            "ok": True,
@@ -401,7 +453,11 @@ class ScreenJobAgent:

    def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]:
        text = str(args.get("text", ""))
-        pyautogui.write(text, interval=self.type_interval)
+        for typed, char in enumerate(text):
+            if self._is_cancelled():
+                return {"ok": False, "cancelled": True, "typed_length": typed}  # chars sent before cancel
+            pyautogui.write(char, interval=0)
+            time.sleep(self.options.type_interval)
        return {"ok": True, "typed_length": len(text), "message": "Text typed."}

    def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
@@ -410,15 +466,25 @@ class ScreenJobAgent:
        if not key:
            return {"ok": False, "error": "Missing key."}
        repeats = min(repeats, 50)
+        pressed = 0
        for _ in range(repeats):
+            if self._is_cancelled():
+                break
            pyautogui.press(key)
+            pressed += 1
            time.sleep(0.03)
-        return {"ok": True, "key": key, "repeats": repeats, "message": "Key press executed."}
+        return {"ok": True, "key": key, "repeats": pressed, "message": "Key press executed."}

    def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
        seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
-        time.sleep(seconds)
-        return {"ok": True, "slept_seconds": seconds, "message": "Sleep completed."}
+        elapsed = 0.0  # sum of the requested sleep slices so far (nominal, not measured wall-clock)
+        while elapsed < seconds:
+            if self._is_cancelled():
+                return {"ok": False, "cancelled": True, "slept_seconds": round(elapsed, 3)}
+            interval = min(0.1, seconds - elapsed)  # slices of at most 0.1 s so the cancel check runs often
+            time.sleep(interval)
+            elapsed += interval
+        return {"ok": True, "slept_seconds": round(seconds, 3), "message": "Sleep completed."}

    def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]:
        command = str(args.get("command", "")).strip()
@@ -426,36 +492,55 @@ class ScreenJobAgent:
            return {"ok": False, "error": "Empty command."}

        started = time.time()
+        process: subprocess.Popen[str] | None = None
        try:
-            completed = subprocess.run(
+            process = subprocess.Popen(
                command,
                shell=True,
-                capture_output=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
                text=True,
-                timeout=self.command_timeout,
-                check=False,
            )
-            elapsed_ms = int((time.time() - started) * 1000)
+            while process.returncode is None:
+                if self._is_cancelled():
+                    process.kill()
+                    process.wait()  # reap the child so it does not linger as a zombie
+                    return {
+                        "ok": False,
+                        "cancelled": True,
+                        "command": command,
+                        "elapsed_ms": int((time.time() - started) * 1000),
+                    }
+                try:
+                    # communicate() drains both pipes, so a chatty child cannot block on a full pipe.
+                    stdout, stderr = process.communicate(timeout=0.05)
+                except subprocess.TimeoutExpired:
+                    if (time.time() - started) > self.options.command_timeout:
+                        process.kill()
+                        stdout, stderr = process.communicate(timeout=2)
+                        return {
+                            "ok": False,
+                            "command": command,
+                            "error": "Command timed out.",
+                            "elapsed_ms": int((time.time() - started) * 1000),
+                            "timeout_seconds": self.options.command_timeout,
+                            "stdout": (stdout or "")[-12000:],
+                            "stderr": (stderr or "")[-12000:],
+                        }
            return {
                "ok": True,
                "command": command,
-                "exit_code": completed.returncode,
-                "stdout": completed.stdout[-12000:],
-                "stderr": completed.stderr[-12000:],
-                "elapsed_ms": elapsed_ms,
-            }
-        except subprocess.TimeoutExpired as exc:
-            elapsed_ms = int((time.time() - started) * 1000)
-            return {
-                "ok": False,
-                "command": command,
-                "error": "Command timed out.",
-                "elapsed_ms": elapsed_ms,
-                "timeout_seconds": self.command_timeout,
-                "stdout": (exc.stdout or "")[-12000:],
-                "stderr": (exc.stderr or "")[-12000:],
+                "exit_code": process.returncode,
+                "stdout": (stdout or "")[-12000:],
+                "stderr": (stderr or "")[-12000:],
+                "elapsed_ms": int((time.time() - started) * 1000),
            }
        except Exception as exc:  # noqa: BLE001
+            if process is not None and process.poll() is None:
+                try:
+                    process.kill()
+                except Exception:  # noqa: BLE001
+                    pass
            return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"}

    def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]:
@@ -465,6 +550,8 @@ class ScreenJobAgent:
        return {"ok": True, "result": result}

    def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
+        if name in self.disabled_tools:
+            return {"ok": False, "error": f"Tool '{name}' is disabled for this job."}
        handlers = {
            "see_screen": self._tool_see_screen,
            "enhance": self._tool_enhance,
@@ -491,7 +578,7 @@ class ScreenJobAgent:

    def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
        return self.client.responses.create(
-            model=self.model,
+            model=self.options.model,
            instructions=SYSTEM_PROMPT,
            tools=self._tool_schemas(),
            input=input_items,
@@ -502,8 +589,18 @@ class ScreenJobAgent:

    def run(self, job: str) -> AgentResult:
        started_at = time.time()
-        self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.model)
+        self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.options.model)
        self.logger.info("Job: %s", job)
+        self.logger.info("Disabled tools: %s", sorted(self.disabled_tools))
+        self._emit(
+            "job_started",
+            {
+                "run_id": self.artifacts.run_id,
+                "model": self.options.model,
+                "objective": job,
+                "disabled_tools": sorted(self.disabled_tools),
+            },
+        )

        self._tool_see_screen({})
        init_input: list[dict[str, Any]] = [
@@ -528,25 +625,37 @@ class ScreenJobAgent:
            )

        pending_input = init_input
+        error_text: str | None = None
+        cancelled = False
+
+        while self.step < self.options.max_steps and not self.completed:
+            if self._is_cancelled():
+                cancelled = True
+                error_text = "Cancelled by user request."
+                break

-        while self.step < self.max_steps and not self.completed:
            self.step += 1
-            self.logger.info("---- Agent step %d/%d ----", self.step, self.max_steps)
+            self.logger.info("---- Agent step %d/%d ----", self.step, self.options.max_steps)
+            self._emit("step_started", {"step": self.step, "max_steps": self.options.max_steps})
            try:
                response = self._call_model(pending_input)
+                self._register_usage(response)
            except Exception as exc:  # noqa: BLE001
                self.logger.exception("OpenAI API call failed on step %d", self.step)
-                raise RuntimeError(f"OpenAI API call failed: {type(exc).__name__}: {exc}") from exc
+                error_text = f"OpenAI API call failed: {type(exc).__name__}: {exc}"
+                break

            self.previous_response_id = response.id
            output_items = list(response.output or [])
            text_preview = getattr(response, "output_text", "") or ""
            if text_preview.strip():
                self.logger.info("Model text: %s", text_preview.strip()[:500])
+                self._emit("model_text", {"step": self.step, "text": text_preview.strip()[:2000]})

            tool_calls = [item for item in output_items if getattr(item, "type", None) == "function_call"]
            if not tool_calls:
                self.logger.warning("No tool calls returned; nudging model to continue with tools.")
+                self._emit("step_warning", {"step": self.step, "message": "No tool calls; nudged model."})
                pending_input = [
                    {
                        "role": "user",
@@ -566,12 +675,21 @@ class ScreenJobAgent:

            next_input: list[dict[str, Any]] = []
            for tool_call in tool_calls:
+                if self._is_cancelled():
+                    cancelled = True
+                    error_text = "Cancelled by user request."
+                    break
+
                name = str(getattr(tool_call, "name", ""))
                call_id = str(getattr(tool_call, "call_id", ""))
                args_raw = getattr(tool_call, "arguments", "{}")
                args = self._safe_parse_args(args_raw)

                self.logger.info("Tool call: %s args=%s", name, json.dumps(args, ensure_ascii=False))
+                self._emit(
+                    "tool_called",
+                    {"step": self.step, "tool": name, "args": args},
+                )
                try:
                    result = self._dispatch_tool(name, args)
                except Exception as exc:  # noqa: BLE001
@@ -587,6 +705,7 @@ class ScreenJobAgent:
                    name,
                    json.dumps(result, ensure_ascii=False)[:2500],
                )
+                self._emit("tool_result", {"step": self.step, "tool": name, "result": result})
                next_input.append(
                    {
                        "type": "function_call_output",
@@ -600,26 +719,69 @@ class ScreenJobAgent:
                    next_input.append(
                        self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta)
                    )
+                    self._emit(
+                        "visual_update",
+                        {
+                            "step": self.step,
+                            "kind": name,
+                            "image_meta": self.last_screen_meta,
+                        },
+                    )

+            if cancelled:
+                break
            pending_input = next_input

        ended_at = time.time()
        if self.completed:
            self.logger.info("Task completed in %d step(s).", self.step)
+            self._emit("job_completed", {"result": self.final_result, "steps": self.step, "usage": self.usage.to_dict()})
            return AgentResult(
                completed=True,
                result=self.final_result,
                steps=self.step,
                started_at=started_at,
                ended_at=ended_at,
+                usage=self.usage,
            )

-        self.logger.warning("Stopped due to step limit (%d).", self.max_steps)
+        if cancelled:
+            self.logger.warning("Run cancelled by user after %d step(s).", self.step)
+            self._emit("job_cancelled", {"steps": self.step, "usage": self.usage.to_dict()})
+            return AgentResult(
+                completed=False,
+                result="Cancelled by user request.",
+                steps=self.step,
+                started_at=started_at,
+                ended_at=ended_at,
+                usage=self.usage,
+                error=error_text,
+                cancelled=True,
+            )
+
+        if error_text:
+            self.logger.error("Run failed: %s", error_text)
+            self._emit("job_failed", {"steps": self.step, "error": error_text, "usage": self.usage.to_dict()})
+            return AgentResult(
+                completed=False,
+                result=error_text,
+                steps=self.step,
+                started_at=started_at,
+                ended_at=ended_at,
+                usage=self.usage,
+                error=error_text,
+            )
+
+        self.logger.warning("Stopped due to step limit (%d).", self.options.max_steps)
+        result_text = f"Stopped after max steps ({self.options.max_steps}) without task_complete."
+        self._emit("job_stopped", {"steps": self.step, "reason": "max_steps", "usage": self.usage.to_dict()})
        return AgentResult(
            completed=False,
-            result=f"Stopped after max steps ({self.max_steps}) without task_complete.",
+            result=result_text,
            steps=self.step,
            started_at=started_at,
            ended_at=ended_at,
+            usage=self.usage,
+            error=result_text,
        )