# screenjob/src/safety.py

from __future__ import annotations

import json
from typing import Any

from openai import OpenAI


# Instructions passed to the model by assess_task_safety (as `instructions=`).
# The model is asked to reply with a single JSON object; assess_task_safety
# reads the "safe" and "reason" keys from it and returns the whole object
# (including "risk_level" and "recommended_disabled_tools") to its caller.
SAFETY_PROMPT = """
You are a safety gate for a desktop automation agent that can execute terminal commands, click/type on screen, and browse websites.
Classify whether the user objective can be executed safely and legally.

Return strict JSON only:
{
  "safe": true|false,
  "reason": "short reason",
  "risk_level": "low|medium|high",
  "recommended_disabled_tools": ["optional_tool_names"]
}
"""


def _extract_json(text: str) -> dict[str, Any]:
    text = text.strip()
    try:
        parsed = json.loads(text)
        return parsed if isinstance(parsed, dict) else {}
    except Exception:
        pass
    start = text.find("{")
    end = text.rfind("}")
    if start >= 0 and end > start:
        try:
            parsed = json.loads(text[start : end + 1])
            return parsed if isinstance(parsed, dict) else {}
        except Exception:
            return {}
    return {}


def assess_task_safety(
    client: OpenAI,
    *,
    model: str,
    objective: str,
    disabled_tools: list[str],
) -> tuple[bool, str, dict[str, Any]]:
    """Ask the model whether ``objective`` may be executed, failing closed.

    Sends ``objective`` and the JSON-encoded ``disabled_tools`` list to
    ``client.responses.create`` with ``SAFETY_PROMPT`` as the instructions,
    then parses the reply's ``output_text`` with ``_extract_json``.

    Args:
        client: Client whose ``responses.create`` method is called.
        model: Model name forwarded to the request.
        objective: The user objective to classify.
        disabled_tools: Tool names reported to the model as disabled.

    Returns:
        ``(safe, reason, parsed)`` where ``parsed`` is the decoded JSON
        object from the model (or ``{}`` when nothing could be decoded, or
        ``{"safe": False}`` when the request itself raised).

        ``safe`` is ``True`` only when the model's ``"safe"`` field is the
        JSON boolean ``true``. Any other value (missing, ``null``, a number,
        or a string such as ``"false"``) is treated as unsafe.

    Raises:
        Nothing from the request: any exception raised while building or
        sending it is converted into an unsafe result.
    """
    try:
        response = client.responses.create(
            model=model,
            instructions=SAFETY_PROMPT,
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": (
                                "Objective:\n"
                                f"{objective}\n\n"
                                f"Disabled tools: {json.dumps(disabled_tools, ensure_ascii=False)}\n"
                                "Answer with strict JSON only."
                            ),
                        }
                    ],
                }
            ],
        )
    except Exception as exc:  # noqa: BLE001
        # Fail closed: a gate that cannot be consulted must not approve.
        return False, f"Safety check failed: {type(exc).__name__}: {exc}", {"safe": False}

    raw_text = getattr(response, "output_text", "") or ""
    parsed = _extract_json(raw_text)
    if not parsed:
        return False, "Safety check returned unparseable response.", parsed

    # Require the literal boolean. bool("false") is True, so plain truthiness
    # would approve a task the model flagged with a stringified verdict.
    safe = parsed.get("safe") is True

    # A JSON null reason would otherwise be rendered as the string "None".
    raw_reason = parsed.get("reason")
    reason = "" if raw_reason is None else str(raw_reason).strip()
    if not reason:
        reason = "No reason provided by safety check."
    return safe, reason, parsed