chore: initialize screenjob project baseline
This commit is contained in:
625
src/agent.py
Normal file
625
src/agent.py
Normal file
@@ -0,0 +1,625 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import time
|
||||
import traceback
|
||||
from typing import Any
|
||||
|
||||
from openai import OpenAI
|
||||
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
|
||||
|
||||
from .models import AgentResult, RunArtifacts
|
||||
from .utils import clamp, draw_global_grid, image_to_data_url, utc_now_iso
|
||||
|
||||
# pyautogui provides the screenshot, mouse and keyboard calls used by the tools
# in this module, so an import failure is re-raised as a RuntimeError that
# carries the install instructions (the original error stays chained as cause).
try:
    import pyautogui
except Exception as import_exc:
    raise RuntimeError(
        "pyautogui is required. Install dependencies with: pip install pyautogui pillow"
    ) from import_exc
|
||||
|
||||
|
||||
SYSTEM_PROMPT = """
|
||||
You are ScreenJob, an autonomous desktop-and-terminal task executor.
|
||||
|
||||
Rules:
|
||||
1) Use tools to act. Do not claim actions without tool calls.
|
||||
2) Prefer execute_command for deterministic actions:
|
||||
- opening URLs/websites (Windows: start https://amazon.de)
|
||||
- launching apps or running terminal checks
|
||||
3) For UI tasks, inspect with see_screen before clicking/typing.
|
||||
4) Coordinates are absolute screen pixels (x, y) from top-left.
|
||||
5) Use enhance(coordinate) when text/UI is unclear.
|
||||
6) For keyboard-heavy interactions, prefer press_key for special keys.
|
||||
7) You may call multiple tools in one step. If needed, do click then sleep.
|
||||
8) Never spam repeated clicks on the same coordinate; switch strategy.
|
||||
9) Keep tool arguments valid JSON and concise.
|
||||
10) When objective is fully complete, call task_complete(result="...").
|
||||
"""
|
||||
|
||||
|
||||
class ScreenJobAgent:
|
||||
def __init__(
|
||||
self,
|
||||
client: OpenAI,
|
||||
logger: logging.Logger,
|
||||
artifacts: RunArtifacts,
|
||||
model: str,
|
||||
max_steps: int,
|
||||
command_timeout: int,
|
||||
type_interval: float,
|
||||
click_pause: float,
|
||||
) -> None:
|
||||
self.client = client
|
||||
self.logger = logger
|
||||
self.artifacts = artifacts
|
||||
self.model = model
|
||||
self.max_steps = max_steps
|
||||
self.command_timeout = command_timeout
|
||||
self.type_interval = type_interval
|
||||
self.click_pause = click_pause
|
||||
|
||||
self.step = 0
|
||||
self.completed = False
|
||||
self.final_result = ""
|
||||
self.previous_response_id: str | None = None
|
||||
|
||||
self.last_screen_data_url: str | None = None
|
||||
self.last_screen_meta: dict[str, Any] | None = None
|
||||
self.click_history: list[tuple[int, int, float]] = []
|
||||
|
||||
def _tool_schemas(self) -> list[dict[str, Any]]:
|
||||
return [
|
||||
{
|
||||
"type": "function",
|
||||
"name": "task_complete",
|
||||
"description": "Call this when the job objective is fully done.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"result": {"type": "string"},
|
||||
},
|
||||
"required": ["result"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "execute_command",
|
||||
"description": (
|
||||
"Run a shell command and return stdout/stderr/exit code. "
|
||||
"Prefer this for deterministic operations like opening URLs."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"command": {"type": "string"},
|
||||
},
|
||||
"required": ["command"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "sleep",
|
||||
"description": (
|
||||
"Pause execution for a short time. "
|
||||
"Use this instead of shell sleep commands."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"seconds": {"type": ["number", "string"]},
|
||||
},
|
||||
"required": ["seconds"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "see_screen",
|
||||
"description": "Capture full screen with coordinate grid overlay.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {},
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "enhance",
|
||||
"description": "Create enhanced zoom around a coordinate for readability.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"coordinate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"x": {"type": "integer"},
|
||||
"y": {"type": "integer"},
|
||||
},
|
||||
"required": ["x", "y"],
|
||||
"additionalProperties": False,
|
||||
}
|
||||
},
|
||||
"required": ["coordinate"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "type",
|
||||
"description": "Type literal text into the active focused element.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {"text": {"type": "string"}},
|
||||
"required": ["text"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "press_key",
|
||||
"description": "Press a specific key (enter, tab, esc, arrows, etc).",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"key": {"type": "string"},
|
||||
"repeats": {"type": "integer", "minimum": 1},
|
||||
},
|
||||
"required": ["key"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"name": "click",
|
||||
"description": (
|
||||
"Click absolute screen coordinate with simple directional offsets. "
|
||||
"Use offset_up/down/left/right values like 2 or '2px'. "
|
||||
"Optional sleep_after_seconds performs a pause immediately after click."
|
||||
),
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"coordinate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"x": {"type": "integer"},
|
||||
"y": {"type": "integer"},
|
||||
},
|
||||
"required": ["x", "y"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"offset": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"x": {"type": "integer"},
|
||||
"y": {"type": "integer"},
|
||||
},
|
||||
"required": [],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"offset_up": {"type": ["integer", "string"]},
|
||||
"offset_down": {"type": ["integer", "string"]},
|
||||
"offset_left": {"type": ["integer", "string"]},
|
||||
"offset_right": {"type": ["integer", "string"]},
|
||||
"sleep_after_seconds": {"type": ["number", "string"]},
|
||||
},
|
||||
"required": ["coordinate"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
def _capture_screen(self, with_grid: bool = True) -> tuple[Image.Image, dict[str, Any]]:
    """Take a full-screen RGB screenshot and describe it.

    Args:
        with_grid: When true the image is passed through ``draw_global_grid``
            before being returned; otherwise the raw screenshot is returned.

    Returns:
        ``(image, meta)`` where ``meta`` carries the screenshot's width and
        height, a ``captured_at`` timestamp and whether the grid was applied.
    """
    raw = pyautogui.screenshot().convert("RGB")
    # Size is taken from the raw capture, before any overlay is applied.
    width, height = raw.size
    if with_grid:
        image = draw_global_grid(raw)
    else:
        image = raw
    return image, {
        "width": width,
        "height": height,
        "captured_at": utc_now_iso(),
        "grid": bool(with_grid),
    }
|
||||
|
||||
def _save_image(self, image: Image.Image, path) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
image.save(path, format="PNG")
|
||||
|
||||
def _build_visual_message(self, title: str, data_url: str, meta: dict[str, Any]) -> dict[str, Any]:
|
||||
text = (
|
||||
f"{title}\n"
|
||||
f"Metadata: {json.dumps(meta, ensure_ascii=False)}\n"
|
||||
"Use coordinates from this image for click/enhance actions."
|
||||
)
|
||||
return {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "input_text", "text": text},
|
||||
{"type": "input_image", "image_url": data_url, "detail": "high"},
|
||||
],
|
||||
}
|
||||
|
||||
def _parse_px(self, value: Any) -> int:
|
||||
if value is None:
|
||||
return 0
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(round(value))
|
||||
text = str(value).strip().lower()
|
||||
if text.endswith("px"):
|
||||
text = text[:-2].strip()
|
||||
try:
|
||||
return int(float(text))
|
||||
except Exception: # noqa: BLE001
|
||||
return 0
|
||||
|
||||
def _parse_seconds(self, value: Any, default: float = 0.0, max_seconds: float = 60.0) -> float:
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, (int, float)):
|
||||
sec = float(value)
|
||||
else:
|
||||
text = str(value).strip().lower()
|
||||
if text.endswith("ms"):
|
||||
try:
|
||||
sec = float(text[:-2].strip()) / 1000.0
|
||||
except Exception: # noqa: BLE001
|
||||
sec = default
|
||||
else:
|
||||
if text.endswith("s"):
|
||||
text = text[:-1].strip()
|
||||
try:
|
||||
sec = float(text)
|
||||
except Exception: # noqa: BLE001
|
||||
sec = default
|
||||
if sec < 0:
|
||||
sec = 0.0
|
||||
if sec > max_seconds:
|
||||
sec = max_seconds
|
||||
return sec
|
||||
|
||||
def _tool_see_screen(self, _: dict[str, Any]) -> dict[str, Any]:
    """Tool handler: capture the gridded screen, save it and remember it.

    The PNG goes to ``artifacts.shots_dir`` under a name derived from the
    current step number. The encoded image and its metadata (capture
    metadata plus the resolved file path) are kept on the instance as the
    latest screen. The argument dict is ignored.
    """
    image, capture_meta = self._capture_screen(with_grid=True)

    target = self.artifacts.shots_dir / f"screen_step_{self.step:03d}.png"
    self._save_image(image, target)
    encoded = image_to_data_url(image, "PNG")
    saved_at = str(target.resolve())

    self.last_screen_data_url = encoded
    self.last_screen_meta = {**capture_meta, "path": saved_at}

    return {
        "ok": True,
        "path": saved_at,
        "meta": self.last_screen_meta,
        "message": "Screen captured with coordinate grid.",
    }
|
||||
|
||||
def _tool_enhance(self, args: dict[str, Any]) -> dict[str, Any]:
    """Tool handler: produce a sharpened 2x zoom around a screen coordinate.

    A grid-free screenshot is cropped to a window reaching 180 px from the
    requested point on each side, doubled in size with bicubic resampling,
    then auto-contrasted, sharpened, contrast-boosted and unsharp-masked.
    The result is saved under ``artifacts.enhance_dir`` and becomes the
    latest screen image/metadata on the instance.

    Args:
        args: May contain ``coordinate`` with ``x`` and ``y``; each missing
            value falls back to 0.
    """
    point = args.get("coordinate") or {}
    x, y = int(point.get("x", 0)), int(point.get("y", 0))

    screen, base_meta = self._capture_screen(with_grid=False)
    width, height = screen.size

    # NOTE(review): assumes clamp(value, low, high) bounds value to
    # [low, high], which keeps the box on-screen and at least 1 px in each
    # dimension — confirm against utils.clamp.
    reach = 180
    left = clamp(x - reach, 0, width - 1)
    top = clamp(y - reach, 0, height - 1)
    right = clamp(x + reach, left + 1, width)
    bottom = clamp(y + reach, top + 1, height)
    box = (left, top, right, bottom)

    region = screen.crop(box)
    doubled_size = (region.width * 2, region.height * 2)
    zoomed = ImageOps.autocontrast(region.resize(doubled_size, Image.Resampling.BICUBIC))
    zoomed = ImageEnhance.Sharpness(zoomed).enhance(2.0)
    zoomed = ImageEnhance.Contrast(zoomed).enhance(1.25)
    zoomed = zoomed.filter(ImageFilter.UnsharpMask(radius=1.8, percent=180, threshold=2))

    target = self.artifacts.enhance_dir / f"enhance_step_{self.step:03d}_{x}_{y}.png"
    self._save_image(zoomed, target)
    encoded = image_to_data_url(zoomed, "PNG")

    meta = {
        "captured_at": utc_now_iso(),
        "source_coord": {"x": x, "y": y},
        "source_box": dict(zip(("left", "top", "right", "bottom"), box)),
        "scale": 2,
        "path": str(target.resolve()),
        "screen_size": {"width": width, "height": height},
        "base_capture_meta": base_meta,
    }
    self.last_screen_data_url = encoded
    self.last_screen_meta = meta
    return {"ok": True, "meta": meta, "message": "Enhanced view generated."}
|
||||
|
||||
def _tool_click(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
coord = args.get("coordinate") or {}
|
||||
offset = args.get("offset") or {}
|
||||
base_x = int(coord.get("x", 0))
|
||||
base_y = int(coord.get("y", 0))
|
||||
|
||||
legacy_dx = self._parse_px(offset.get("x", 0))
|
||||
legacy_dy = self._parse_px(offset.get("y", 0))
|
||||
up = self._parse_px(args.get("offset_up", 0))
|
||||
down = self._parse_px(args.get("offset_down", 0))
|
||||
left = self._parse_px(args.get("offset_left", 0))
|
||||
right = self._parse_px(args.get("offset_right", 0))
|
||||
|
||||
x = base_x + legacy_dx + right - left
|
||||
y = base_y + legacy_dy + down - up
|
||||
width, height = pyautogui.size()
|
||||
x = clamp(x, 0, max(0, width - 1))
|
||||
y = clamp(y, 0, max(0, height - 1))
|
||||
|
||||
now = time.time()
|
||||
self.click_history.append((x, y, now))
|
||||
self.click_history = self.click_history[-20:]
|
||||
very_recent = [(cx, cy, ts) for (cx, cy, ts) in self.click_history if now - ts <= 25]
|
||||
near_same = [
|
||||
(cx, cy, ts)
|
||||
for (cx, cy, ts) in very_recent
|
||||
if abs(cx - x) <= 6 and abs(cy - y) <= 6
|
||||
]
|
||||
if len(near_same) >= 4:
|
||||
return {
|
||||
"ok": False,
|
||||
"blocked": True,
|
||||
"error": (
|
||||
"Repeated click loop detected at nearly same coordinate. "
|
||||
"Switch strategy: call see_screen/enhance and use execute_command."
|
||||
),
|
||||
"clicked": {"x": x, "y": y},
|
||||
"recent_similar_clicks": len(near_same),
|
||||
}
|
||||
|
||||
pyautogui.moveTo(x, y, duration=self.click_pause)
|
||||
pyautogui.click(x=x, y=y)
|
||||
sleep_after = self._parse_seconds(args.get("sleep_after_seconds", 0), default=0.0, max_seconds=30.0)
|
||||
if sleep_after > 0:
|
||||
time.sleep(sleep_after)
|
||||
else:
|
||||
time.sleep(0.15)
|
||||
|
||||
return {
|
||||
"ok": True,
|
||||
"clicked": {"x": x, "y": y},
|
||||
"base_coordinate": {"x": base_x, "y": base_y},
|
||||
"applied_offset": {
|
||||
"legacy": {"x": legacy_dx, "y": legacy_dy},
|
||||
"directional": {"up": up, "down": down, "left": left, "right": right},
|
||||
},
|
||||
"sleep_after_seconds": sleep_after,
|
||||
"screen_size": {"width": width, "height": height},
|
||||
"message": "Click executed.",
|
||||
}
|
||||
|
||||
def _tool_type(self, args: dict[str, Any]) -> dict[str, Any]:
    """Tool handler: type ``args["text"]`` into whatever currently has focus.

    The text is coerced to ``str`` (missing means empty) and sent through
    ``pyautogui.write`` using the configured per-character interval.
    """
    # NOTE(review): pyautogui.write may skip characters that have no key
    # mapping (e.g. some non-ASCII text) — confirm; typed_length reports the
    # requested length either way.
    payload = str(args.get("text", ""))
    pyautogui.write(payload, interval=self.type_interval)
    return {
        "ok": True,
        "typed_length": len(payload),
        "message": "Text typed.",
    }
|
||||
|
||||
def _tool_press_key(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
key = str(args.get("key", "")).strip().lower()
|
||||
repeats = max(1, int(args.get("repeats", 1)))
|
||||
if not key:
|
||||
return {"ok": False, "error": "Missing key."}
|
||||
repeats = min(repeats, 50)
|
||||
for _ in range(repeats):
|
||||
pyautogui.press(key)
|
||||
time.sleep(0.03)
|
||||
return {"ok": True, "key": key, "repeats": repeats, "message": "Key press executed."}
|
||||
|
||||
def _tool_sleep(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
seconds = self._parse_seconds(args.get("seconds"), default=0.0, max_seconds=60.0)
|
||||
time.sleep(seconds)
|
||||
return {"ok": True, "slept_seconds": seconds, "message": "Sleep completed."}
|
||||
|
||||
def _tool_execute_command(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
command = str(args.get("command", "")).strip()
|
||||
if not command:
|
||||
return {"ok": False, "error": "Empty command."}
|
||||
|
||||
started = time.time()
|
||||
try:
|
||||
completed = subprocess.run(
|
||||
command,
|
||||
shell=True,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.command_timeout,
|
||||
check=False,
|
||||
)
|
||||
elapsed_ms = int((time.time() - started) * 1000)
|
||||
return {
|
||||
"ok": True,
|
||||
"command": command,
|
||||
"exit_code": completed.returncode,
|
||||
"stdout": completed.stdout[-12000:],
|
||||
"stderr": completed.stderr[-12000:],
|
||||
"elapsed_ms": elapsed_ms,
|
||||
}
|
||||
except subprocess.TimeoutExpired as exc:
|
||||
elapsed_ms = int((time.time() - started) * 1000)
|
||||
return {
|
||||
"ok": False,
|
||||
"command": command,
|
||||
"error": "Command timed out.",
|
||||
"elapsed_ms": elapsed_ms,
|
||||
"timeout_seconds": self.command_timeout,
|
||||
"stdout": (exc.stdout or "")[-12000:],
|
||||
"stderr": (exc.stderr or "")[-12000:],
|
||||
}
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return {"ok": False, "command": command, "error": f"{type(exc).__name__}: {exc}"}
|
||||
|
||||
def _tool_task_complete(self, args: dict[str, Any]) -> dict[str, Any]:
|
||||
result = str(args.get("result", "")).strip() or "Task completed."
|
||||
self.completed = True
|
||||
self.final_result = result
|
||||
return {"ok": True, "result": result}
|
||||
|
||||
def _dispatch_tool(self, name: str, args: dict[str, Any]) -> dict[str, Any]:
|
||||
handlers = {
|
||||
"see_screen": self._tool_see_screen,
|
||||
"enhance": self._tool_enhance,
|
||||
"click": self._tool_click,
|
||||
"type": self._tool_type,
|
||||
"press_key": self._tool_press_key,
|
||||
"sleep": self._tool_sleep,
|
||||
"execute_command": self._tool_execute_command,
|
||||
"task_complete": self._tool_task_complete,
|
||||
}
|
||||
handler = handlers.get(name)
|
||||
if handler is None:
|
||||
return {"ok": False, "error": f"Unknown tool: {name}"}
|
||||
return handler(args)
|
||||
|
||||
def _safe_parse_args(self, raw: str | None) -> dict[str, Any]:
|
||||
if not raw:
|
||||
return {}
|
||||
try:
|
||||
parsed = json.loads(raw)
|
||||
return parsed if isinstance(parsed, dict) else {"value": parsed}
|
||||
except Exception: # noqa: BLE001
|
||||
return {"_raw": raw}
|
||||
|
||||
def _call_model(self, input_items: list[dict[str, Any]]) -> Any:
    """Send one request to the model and return the raw response object.

    The request carries the system prompt as instructions, the tool
    schemas, and ``previous_response_id`` so it is linked to the previous
    response of this run (``None`` on the first call).
    """
    request: dict[str, Any] = {
        "model": self.model,
        "instructions": SYSTEM_PROMPT,
        "tools": self._tool_schemas(),
        "input": input_items,
        "previous_response_id": self.previous_response_id,
        "parallel_tool_calls": True,
        "max_tool_calls": 8,
    }
    return self.client.responses.create(**request)
|
||||
|
||||
def run(self, job: str) -> AgentResult:
    """Drive the model/tool loop until the job completes or steps run out.

    An initial screen capture is sent together with the job text. Each step
    calls the model, executes the function calls it returns and feeds their
    outputs (plus any new screen image) back as the next input. The loop
    ends when ``task_complete`` is called or ``max_steps`` is reached.

    Args:
        job: Natural-language description of the task.

    Returns:
        An ``AgentResult`` with the completion flag, the final result text,
        the number of steps used and wall-clock start/end times.

    Raises:
        RuntimeError: If a model API call fails (original error chained).

    Safeguards in the tool loop:
        * A call whose arguments were not valid JSON is answered with an
          error instead of being executed with default arguments.
        * Once ``task_complete`` has run, the remaining calls of that batch
          are skipped.
        * Tool results are serialized with a string fallback so an
          unexpected value cannot abort the run.
    """

    def user_text(text: str) -> dict[str, Any]:
        # Plain-text user message in the shape the model input expects.
        return {"role": "user", "content": [{"type": "input_text", "text": text}]}

    def to_json(payload: Any) -> str:
        # default=str keeps serialization from raising on odd values.
        return json.dumps(payload, ensure_ascii=False, default=str)

    started_at = time.time()
    self.logger.info("Starting run_id=%s model=%s", self.artifacts.run_id, self.model)
    self.logger.info("Job: %s", job)

    self._tool_see_screen({})
    init_input: list[dict[str, Any]] = [
        user_text(
            f"JOB: {job}\n"
            "You are in an action loop. Prefer execute_command for deterministic actions. "
            "You can return multiple tool calls in one step (example: click then sleep). "
            "Call task_complete(result=...) only when truly done."
        )
    ]
    if self.last_screen_data_url and self.last_screen_meta:
        init_input.append(
            self._build_visual_message("Initial screen capture", self.last_screen_data_url, self.last_screen_meta)
        )

    pending_input = init_input

    while self.step < self.max_steps and not self.completed:
        self.step += 1
        self.logger.info("---- Agent step %d/%d ----", self.step, self.max_steps)
        try:
            response = self._call_model(pending_input)
        except Exception as exc:  # noqa: BLE001
            self.logger.exception("OpenAI API call failed on step %d", self.step)
            raise RuntimeError(f"OpenAI API call failed: {type(exc).__name__}: {exc}") from exc

        self.previous_response_id = response.id
        output_items = list(response.output or [])
        text_preview = getattr(response, "output_text", "") or ""
        if text_preview.strip():
            self.logger.info("Model text: %s", text_preview.strip()[:500])

        tool_calls = [item for item in output_items if getattr(item, "type", None) == "function_call"]
        if not tool_calls:
            self.logger.warning("No tool calls returned; nudging model to continue with tools.")
            pending_input = [
                user_text(
                    "No function call was returned. Continue by using tools. "
                    "You may call multiple tools in one step. "
                    "When complete, call task_complete(result=...)."
                )
            ]
            continue

        next_input: list[dict[str, Any]] = []
        for tool_call in tool_calls:
            name = str(getattr(tool_call, "name", ""))
            call_id = str(getattr(tool_call, "call_id", ""))
            args = self._safe_parse_args(getattr(tool_call, "arguments", "{}"))

            self.logger.info("Tool call: %s args=%s", name, to_json(args))
            if "_raw" in args:
                # _safe_parse_args could not decode the arguments; running
                # the tool on defaults could click or finish by accident.
                result: dict[str, Any] = {
                    "ok": False,
                    "error": "Tool arguments were not valid JSON. Resend the call with a JSON object.",
                    "received": str(args["_raw"])[:500],
                }
            else:
                try:
                    result = self._dispatch_tool(name, args)
                except Exception as exc:  # noqa: BLE001
                    self.logger.exception("Tool execution failed: %s", name)
                    result = {
                        "ok": False,
                        "error": f"{type(exc).__name__}: {exc}",
                        "traceback": traceback.format_exc()[-8000:],
                    }

            serialized = to_json(result)
            self.logger.debug("Tool result for %s: %s", name, serialized[:2500])
            next_input.append(
                {
                    "type": "function_call_output",
                    "call_id": call_id,
                    "output": serialized,
                }
            )

            if name in ("see_screen", "enhance") and self.last_screen_data_url and self.last_screen_meta:
                title = "Updated screen capture" if name == "see_screen" else "Enhanced screen region"
                next_input.append(
                    self._build_visual_message(title, self.last_screen_data_url, self.last_screen_meta)
                )

            if self.completed:
                # The job is declared done and the loop exits without another
                # model call, so later calls in this batch must not act.
                break

        pending_input = next_input

    ended_at = time.time()
    if self.completed:
        self.logger.info("Task completed in %d step(s).", self.step)
        return AgentResult(
            completed=True,
            result=self.final_result,
            steps=self.step,
            started_at=started_at,
            ended_at=ended_at,
        )

    self.logger.warning("Stopped due to step limit (%d).", self.max_steps)
    return AgentResult(
        completed=False,
        result=f"Stopped after max steps ({self.max_steps}) without task_complete.",
        steps=self.step,
        started_at=started_at,
        ended_at=ended_at,
    )
|
||||
|
||||
Reference in New Issue
Block a user