"""LLM-backed safety gate that classifies an objective before the agent acts on it."""

from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Only referenced in annotations, which are lazy thanks to the
    # ``__future__`` import above, so no runtime import is required.
    from openai import OpenAI

# NOTE(review): the wording is unchanged, but the line breaks inside this
# prompt were reconstructed from a whitespace-collapsed source -- confirm
# against the canonical copy if exact prompt bytes matter.
SAFETY_PROMPT = """
You are a safety gate for a desktop automation agent that can execute terminal commands,
click/type on screen, and browse websites.

Classify whether the user objective can be executed safely and legally.
Return strict JSON only:
{
  "safe": true|false,
  "reason": "short reason",
  "risk_level": "low|medium|high",
  "recommended_disabled_tools": ["optional_tool_names"]
}
"""

# Errors ``json.loads`` can raise for a ``str`` input: malformed JSON
# (``JSONDecodeError`` is a ``ValueError``) and pathologically deep nesting.
_JSON_PARSE_ERRORS = (ValueError, RecursionError)


def _extract_json(text: str) -> dict[str, Any]:
    """Best-effort extraction of a JSON object from a model reply.

    The whole (stripped) text is tried first. If that is not valid JSON, the
    span from the first ``{`` to the last ``}`` is tried, which tolerates
    replies that wrap the object in prose or a code fence.

    Returns:
        The decoded object, or an empty dict when nothing parses or when the
        decoded value is not a JSON object.
    """
    text = text.strip()

    try:
        parsed = json.loads(text)
    except _JSON_PARSE_ERRORS:
        pass  # Fall through to the brace-delimited attempt below.
    else:
        return parsed if isinstance(parsed, dict) else {}

    start = text.find("{")
    end = text.rfind("}")
    if start < 0 or end <= start:
        return {}

    try:
        parsed = json.loads(text[start : end + 1])
    except _JSON_PARSE_ERRORS:
        return {}
    return parsed if isinstance(parsed, dict) else {}


def assess_task_safety(
    client: OpenAI,
    *,
    model: str,
    objective: str,
    disabled_tools: list[str],
) -> tuple[bool, str, dict[str, Any]]:
    """Ask the model whether ``objective`` is safe to execute.

    The gate fails closed: an API error, an unparseable reply, or a ``safe``
    field that is anything other than the JSON boolean ``true`` all yield
    ``safe == False``.

    Args:
        client: Object exposing ``responses.create(...)``.
        model: Model identifier forwarded to the API call.
        objective: The user objective to classify.
        disabled_tools: Tool names already disabled; sent to the model as
            context in the request text.

    Returns:
        ``(safe, reason, parsed)`` where ``parsed`` is the decoded JSON reply
        (``{}`` if unparseable, ``{"safe": False}`` if the API call raised).
    """
    try:
        response = client.responses.create(
            model=model,
            instructions=SAFETY_PROMPT,
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": (
                                "Objective:\n"
                                f"{objective}\n\n"
                                f"Disabled tools: {json.dumps(disabled_tools, ensure_ascii=False)}\n"
                                "Answer with strict JSON only."
                            ),
                        }
                    ],
                }
            ],
        )
    except Exception as exc:  # noqa: BLE001 - any failure must deny, not crash the agent
        return False, f"Safety check failed: {type(exc).__name__}: {exc}", {"safe": False}

    raw_text = getattr(response, "output_text", "") or ""
    parsed = _extract_json(raw_text)
    if not parsed:
        return False, "Safety check returned unparseable response.", parsed

    # Accept only a literal JSON ``true``. ``bool("false")`` is True, so
    # coercing the field would let a string verdict approve the task.
    safe = parsed.get("safe") is True

    # A JSON ``null`` reason must fall back to the default message rather than
    # being rendered as the text "None".
    raw_reason = parsed.get("reason")
    reason = "" if raw_reason is None else str(raw_reason).strip()
    if not reason:
        reason = "No reason provided by safety check."

    return safe, reason, parsed