81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
from openai import OpenAI
|
|
|
|
|
|
# Instructions sent with the safety-gate model call (passed as `instructions=`
# by assess_task_safety). The model is asked to classify the user's objective
# and to reply with one JSON object holding the keys "safe", "reason",
# "risk_level" and "recommended_disabled_tools".
# NOTE(review): stray lone "|" separator lines were removed from this literal;
# the JSON keys are kept unindented as found — confirm against the upstream file.
SAFETY_PROMPT = """
You are a safety gate for a desktop automation agent that can execute terminal commands, click/type on screen, and browse websites.
Classify whether the user objective can be executed safely and legally.

Return strict JSON only:
{
"safe": true|false,
"reason": "short reason",
"risk_level": "low|medium|high",
"recommended_disabled_tools": ["optional_tool_names"]
}
"""
|
|
|
|
|
|
def _extract_json(text: str) -> dict[str, Any]:
    """Pull a JSON object out of *text*, returning ``{}`` when none is usable.

    The stripped text is decoded as a whole first. Only if that decode fails
    is a second attempt made on the span from the first ``{`` to the last
    ``}``, which copes with replies wrapped in prose or code fences. A
    successful decode that yields anything other than a dict gives ``{}``.
    """
    stripped = text.strip()

    candidates = [stripped]
    first_brace = stripped.find("{")
    last_brace = stripped.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(stripped[first_brace : last_brace + 1])

    for candidate in candidates:
        try:
            decoded = json.loads(candidate)
        except Exception:
            # Best-effort parsing: a failed decode just moves on to the next
            # candidate (or to the empty-dict fallback below).
            continue
        # The first candidate that decodes settles the result, dict or not.
        return decoded if isinstance(decoded, dict) else {}

    return {}
|
|
|
|
|
|
def assess_task_safety(
    client: OpenAI,
    *,
    model: str,
    objective: str,
    disabled_tools: list[str],
) -> tuple[bool, str, dict[str, Any]]:
    """Ask the model whether *objective* may be executed and return its verdict.

    The request goes through ``client.responses.create`` with
    ``SAFETY_PROMPT`` as the instructions, and the reply text is decoded with
    ``_extract_json``. Every failure path is fail-closed: a request error, an
    unparseable reply, or a ``"safe"`` value that is not the JSON boolean
    ``true`` all yield ``safe == False``.

    Args:
        client: Client object whose ``responses.create`` method is called.
        model: Model name forwarded to the API call.
        objective: User objective to classify; embedded in the prompt text.
        disabled_tools: Tool names that are already disabled; serialised as
            JSON into the prompt text.

    Returns:
        ``(safe, reason, parsed)``. ``parsed`` is the decoded reply object,
        ``{}`` when the reply could not be parsed, or ``{"safe": False}``
        when building or sending the request raised.
    """
    try:
        # Prompt construction stays inside the try so that a serialisation
        # error in disabled_tools is reported like any other failure.
        response = client.responses.create(
            model=model,
            instructions=SAFETY_PROMPT,
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": (
                                "Objective:\n"
                                f"{objective}\n\n"
                                f"Disabled tools: {json.dumps(disabled_tools, ensure_ascii=False)}\n"
                                "Answer with strict JSON only."
                            ),
                        }
                    ],
                }
            ],
        )
    except Exception as exc:  # noqa: BLE001
        return False, f"Safety check failed: {type(exc).__name__}: {exc}", {"safe": False}

    raw_text = getattr(response, "output_text", "") or ""
    parsed = _extract_json(raw_text)
    if not parsed:
        return False, "Safety check returned unparseable response.", parsed

    # Only a literal JSON `true` counts as approval. Coercing with bool()
    # would treat strings such as "false" or "no" as safe.
    safe = parsed.get("safe") is True

    # A JSON null reason must not surface as the text "None".
    reason_value = parsed.get("reason")
    reason = "" if reason_value is None else str(reason_value).strip()
    if not reason:
        reason = "No reason provided by safety check."

    return safe, reason, parsed
|
|
|