Files
screenjob/src/safety.py

81 lines
2.3 KiB
Python

from __future__ import annotations
import json
from typing import Any
from openai import OpenAI
# Instructions passed to the model by assess_task_safety (as the `instructions`
# argument of the API call). They ask for a single JSON object with the keys
# "safe", "reason", "risk_level" and "recommended_disabled_tools".
# NOTE: this text is sent verbatim at runtime, so editing it changes behavior.
SAFETY_PROMPT = """
You are a safety gate for a desktop automation agent that can execute terminal commands, click/type on screen, and browse websites.
Classify whether the user objective can be executed safely and legally.
Return strict JSON only:
{
"safe": true|false,
"reason": "short reason",
"risk_level": "low|medium|high",
"recommended_disabled_tools": ["optional_tool_names"]
}
"""
def _extract_json(text: str) -> dict[str, Any]:
text = text.strip()
try:
parsed = json.loads(text)
return parsed if isinstance(parsed, dict) else {}
except Exception:
pass
start = text.find("{")
end = text.rfind("}")
if start >= 0 and end > start:
try:
parsed = json.loads(text[start : end + 1])
return parsed if isinstance(parsed, dict) else {}
except Exception:
return {}
return {}
def assess_task_safety(
    client: OpenAI,
    *,
    model: str,
    objective: str,
    disabled_tools: list[str],
) -> tuple[bool, str, dict[str, Any]]:
    """Ask the model whether ``objective`` may be executed by the agent.

    The gate fails closed: an API error, an unparseable reply, or a ``"safe"``
    field that is anything other than the JSON boolean ``true`` all produce an
    unsafe verdict.

    Args:
        client: Client whose ``responses.create`` method is called.
        model: Model name forwarded to the API call.
        objective: The user objective to classify.
        disabled_tools: Tool names already disabled; serialized as JSON into
            the request text.

    Returns:
        A ``(safe, reason, parsed)`` tuple. ``safe`` is the verdict callers
        should act on. ``reason`` is the model's explanation or a fallback
        message. ``parsed`` is the raw JSON object from the reply (``{}`` when
        unparseable, ``{"safe": False}`` when the API call raised); its
        ``"safe"`` entry is *not* normalized, so prefer the first element.
    """
    try:
        response = client.responses.create(
            model=model,
            instructions=SAFETY_PROMPT,
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": (
                                "Objective:\n"
                                f"{objective}\n\n"
                                f"Disabled tools: {json.dumps(disabled_tools, ensure_ascii=False)}\n"
                                "Answer with strict JSON only."
                            ),
                        }
                    ],
                }
            ],
        )
    except Exception as exc:  # noqa: BLE001
        # Any failure to obtain a verdict (network, auth, serialization, ...)
        # must block the task rather than let it through.
        return False, f"Safety check failed: {type(exc).__name__}: {exc}", {"safe": False}

    raw_text = getattr(response, "output_text", "") or ""
    parsed = _extract_json(raw_text)
    if not parsed:
        return False, "Safety check returned unparseable response.", parsed

    # Only the JSON boolean `true` counts as approval. Applying bool() to the
    # raw value would treat any non-empty string -- including "false" or
    # "no" -- as truthy and approve an unsafe task.
    safe = parsed.get("safe") is True

    # A JSON null reason is treated as missing instead of the text "None".
    reason_value = parsed.get("reason")
    reason_text = "" if reason_value is None else str(reason_value).strip()
    reason = reason_text or "No reason provided by safety check."
    return safe, reason, parsed