# gitea-codex/src/gitea_codex_bot/services/reviewer.py

from __future__ import annotations

import json
import os
import shlex
import subprocess
from fnmatch import fnmatch
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any

import httpx

from gitea_codex_bot.config import Settings
from gitea_codex_bot.services.gitea import GiteaClient, PullRequestContext
from gitea_codex_bot.services.repo_config import RepoReviewConfig, load_repo_review_config
from gitea_codex_bot.types import ParsedCommand


class ReviewError(RuntimeError):
    """Raised when a review cannot be requested or its result is unusable."""


def _run_git(args: list[str], cwd: Path | None = None) -> str:
    """Run ``git`` with *args* and return what it printed to stdout.

    Args:
        args: Arguments placed after the ``git`` executable name.
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Returns:
        The captured standard output, decoded as text.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git"] + list(args)
    finished = subprocess.run(
        command,
        cwd=cwd,
        capture_output=True,
        text=True,
        check=True,
    )
    return finished.stdout


def checkout_pr(tmpdir: Path, pr: PullRequestContext) -> Path:
    """Clone the PR's repository into ``tmpdir / "repo"`` and check out its head.

    The clone is shallow (depth 50, no tags). The base and head refs are then
    fetched from ``origin`` and ``pr.head_sha`` is checked out.

    Args:
        tmpdir: Existing directory that will receive the ``repo`` checkout.
        pr: Pull request whose ``clone_url``, refs and head SHA are used.

    Returns:
        Path of the checked-out repository.

    Raises:
        subprocess.CalledProcessError: If any git command fails.
    """
    target = tmpdir / "repo"
    clone_args = ["clone", "--no-tags", "--depth", "50", pr.clone_url, str(target)]
    _run_git(clone_args)
    fetch_args = ["fetch", "origin", pr.base_ref, pr.head_ref]
    _run_git(fetch_args, cwd=target)
    checkout_args = ["checkout", pr.head_sha]
    _run_git(checkout_args, cwd=target)
    return target


def collect_diff_context(repo_dir: Path, pr: PullRequestContext, max_diff_bytes: int) -> dict[str, Any]:
    """Gather the PR's unified diff and the list of files it changes.

    Both git calls use the three-dot range ``base_sha...head_sha``.

    Args:
        repo_dir: Checked-out repository to run git in.
        pr: Pull request supplying ``base_sha`` and ``head_sha``.
        max_diff_bytes: Upper bound on the UTF-8 size of the returned diff.

    Returns:
        A dict with ``diff`` (possibly cut to ``max_diff_bytes``),
        ``changed_files`` (non-blank paths) and ``truncated`` (whether the
        diff was cut).
    """
    revision_range = f"{pr.base_sha}...{pr.head_sha}"
    diff_text = _run_git(["diff", revision_range], cwd=repo_dir)
    names_output = _run_git(["diff", "--name-only", revision_range], cwd=repo_dir)

    stripped_names = (raw.strip() for raw in names_output.splitlines())
    changed = [name for name in stripped_names if name]

    encoded = diff_text.encode("utf-8")
    was_truncated = len(encoded) > max_diff_bytes
    if was_truncated:
        # Cutting on a byte boundary may split a character; the partial
        # trailing bytes are dropped rather than raising.
        diff_text = encoded[:max_diff_bytes].decode("utf-8", errors="ignore")
    return {"diff": diff_text, "changed_files": changed, "truncated": was_truncated}


def _apply_ignore_patterns(changed_files: list[str], ignore_patterns: list[str]) -> list[str]:
    if not ignore_patterns:
        return changed_files
    kept: list[str] = []
    for path in changed_files:
        if any(fnmatch(path, pattern) for pattern in ignore_patterns):
            continue
        kept.append(path)
    return kept


def _collect_changed_file_contents(repo_dir: Path, changed_files: list[str], max_total_bytes: int) -> str:
    chunks: list[str] = []
    total = 0
    for rel in changed_files:
        path = repo_dir / rel
        if not path.exists() or not path.is_file():
            continue
        try:
            content = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        block = f"\n### {rel}\n{content}\n"
        block_bytes = len(block.encode("utf-8"))
        if total + block_bytes > max_total_bytes:
            break
        chunks.append(block)
        total += block_bytes
    return "".join(chunks).strip()


def _collect_test_output(repo_dir: Path, timeout_seconds: int) -> str:
    try:
        completed = subprocess.run(
            ["pytest", "-q"],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            check=False,
        )
        output = (completed.stdout + "\n" + completed.stderr).strip()
        return output[:10000]
    except Exception as exc:
        return f"Test execution unavailable: {exc}"


def _redact_secrets_from_diff(diff: str) -> str:
    secret_terms = ("api_key", "token", "secret", "password", "private_key", "-----begin")
    redacted_lines: list[str] = []
    for line in diff.splitlines():
        lower = line.lower()
        if any(term in lower for term in secret_terms):
            redacted_lines.append("[REDACTED_POTENTIAL_SECRET]")
        else:
            redacted_lines.append(line)
    return "\n".join(redacted_lines)


def _build_prompt(
    pr: PullRequestContext,
    command: ParsedCommand,
    diff_context: dict[str, Any],
    repo_cfg: RepoReviewConfig,
    *,
    changed_file_contents: str,
    test_output: str | None,
) -> str:
    mode = command.mode if command.name in {"review", "rerun"} else "summary"
    changed_files = diff_context.get("changed_files") or []
    changed_files_section = os.linesep.join(changed_files) if changed_files else "(none)"
    unified_diff = str(diff_context.get("diff", ""))
    unified_diff_section = unified_diff if unified_diff.strip() else "(empty)"
    return (
        "You are reviewing a Gitea pull request.\n\n"
        "Focus only on issues introduced by this PR.\n"
        "Prioritize correctness, security, data loss, broken behavior, bad migrations, and missing tests.\n"
        "Avoid style nitpicks.\n\n"
        "You do not have internet/network access. Do not try to fetch URLs.\n"
        "Use only the PR metadata, changed files, diff, and optional file/test content included below.\n"
        "Never claim that PR content is inaccessible or missing if these sections are present.\n"
        "If the changed-file list is `(none)` and unified diff is `(empty)`, treat this as a no-op PR and explain that no code changes were detected.\n\n"
        "Return JSON only with schema:\n"
        "{\n"
        '  "verdict": "correct" | "has_issues",\n'
        '  "confidence": 0.0,\n'
        '  "summary": "...",\n'
        '  "findings": [{"severity":"low|medium|high|critical","file":"...","line_start":1,"line_end":1,"title":"...","body":"...","suggestion":"..."}],\n'
        '  "markdown_comment": "Full markdown comment body to post to Gitea. Include clear section breaks and blank lines."\n'
        "}\n\n"
        f"PR URL: {pr.html_url}\n"
        f"Mode: {mode}\n"
        f"Trigger message: {command.raw}\n"
        f"Repo focus: {', '.join(repo_cfg.focus)}\n"
        f"Diff truncated: {diff_context['truncated']}\n"
        f"Changed files:\n{changed_files_section}\n\n"
        f"Unified diff:\n{unified_diff_section}\n\n"
        f"Changed file content (optional):\n{changed_file_contents or '(not included)'}\n\n"
        f"Test output (optional):\n{test_output or '(not included)'}\n"
    )


def _call_openai_review(settings: Settings, prompt: str) -> dict[str, Any]:
    """Send *prompt* to the OpenAI Responses API and return the parsed review.

    The first non-empty ``text`` entry found in the response's ``output``
    items is parsed as JSON. The parsed object gets a ``_meta`` entry
    describing the model and token usage.

    Args:
        settings: Supplies the API key, optional organization/project ids and
            the review model name.
        prompt: Full prompt text sent as the request ``input``.

    Returns:
        The review as a dict, including the added ``_meta`` key.

    Raises:
        ReviewError: If no API key is configured, the response carries no
            output text, or the output is valid JSON but not an object.
        httpx.HTTPStatusError: If the API answers with an error status.
        json.JSONDecodeError: If the output text is not valid JSON.
    """
    api_key = settings.openai_api_key.get_secret_value() if settings.openai_api_key else ""
    if not api_key.strip():
        raise ReviewError("OPENAI_API_KEY is required for API-key review mode.")
    headers: dict[str, str] = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    if settings.openai_org_id:
        headers["OpenAI-Organization"] = settings.openai_org_id
    if settings.openai_project_id:
        headers["OpenAI-Project"] = settings.openai_project_id

    body = {
        "model": settings.openai_review_model,
        "input": prompt,
        "text": {"format": {"type": "json_object"}},
    }
    with httpx.Client(timeout=120.0) as client:
        response = client.post("https://api.openai.com/v1/responses", headers=headers, json=body)
        response.raise_for_status()
        payload = response.json()

    # `or []` tolerates keys that are present but null.
    for item in payload.get("output") or []:
        for content in item.get("content") or []:
            text_value = content.get("text")
            if not text_value:
                continue
            result = json.loads(text_value)
            if not isinstance(result, dict):
                # Fail here, where callers already handle errors, instead of
                # handing back a value that violates the declared return type.
                raise ReviewError(
                    f"OpenAI review output must be a JSON object, got {type(result).__name__}."
                )
            result["_meta"] = _build_openai_result_meta(payload, settings)
            return result
    raise ReviewError("OpenAI response did not contain JSON output text.")


def _build_openai_result_meta(payload: dict[str, Any], settings: Settings) -> dict[str, Any]:
    usage_raw = payload.get("usage")
    usage: dict[str, int] = {}
    if isinstance(usage_raw, dict):
        for output_key, source_key in (
            ("input_tokens", "input_tokens"),
            ("output_tokens", "output_tokens"),
            ("total_tokens", "total_tokens"),
        ):
            value = usage_raw.get(source_key)
            if isinstance(value, int):
                usage[output_key] = value
    model = payload.get("model")
    if not isinstance(model, str) or not model.strip():
        model = settings.openai_review_model
    return {"source": "openai_api", "model": model, "usage": usage}


def _summarize_openai_failure(exc: Exception) -> str:
    """Turn an exception from the OpenAI call into a short one-line message.

    Args:
        exc: The exception that was raised.

    Returns:
        For HTTP status errors, the status code plus the response body with
        whitespace collapsed and limited to 400 characters; a fixed message
        for timeouts; otherwise the exception text, or the exception class
        name when that text is empty.
    """
    if isinstance(exc, httpx.HTTPStatusError):
        code = exc.response.status_code
        body_text = exc.response.text.strip()
        if not body_text:
            return f"OpenAI API HTTP {code}."
        # Collapse newlines and runs of whitespace so the message stays on one line.
        flattened = " ".join(body_text.split())
        if len(flattened) > 400:
            flattened = flattened[:400] + "..."
        return f"OpenAI API HTTP {code}: {flattened}"

    if isinstance(exc, httpx.TimeoutException):
        return "OpenAI API request timed out."

    detail = str(exc).strip()
    return detail or f"{exc.__class__.__name__} (no details)"


def _fallback_review(diff_context: dict[str, Any], *, failure_reason: str | None = None) -> dict[str, Any]:
    findings: list[dict[str, Any]] = []
    summary = "Fallback analysis was used because OpenAI review was unavailable."

    if failure_reason:
        summary = f"OpenAI review failed. Error: {failure_reason}"
        findings.append(
            {
                "severity": "high",
                "file": "unknown",
                "line_start": 1,
                "line_end": 1,
                "title": "OpenAI review request failed",
                "body": failure_reason,
                "suggestion": "Fix API/auth/network issues and rerun @codex review.",
            }
        )

    return {
        "verdict": "correct" if not findings else "has_issues",
        "confidence": 0.4 if not findings else 0.6,
        "summary": summary,
        "findings": findings,
    }


def run_review_for_pr(
    settings: Settings,
    gitea: GiteaClient,
    repo: str,
    pr_number: int,
    command: ParsedCommand,
) -> tuple[dict[str, Any], RepoReviewConfig]:
    """Review one pull request and return the normalized result.

    The prompt is prepared from a fresh checkout, then sent to OpenAI. If the
    OpenAI call raises, a fallback review describing the failure is used.

    Args:
        settings: Application settings.
        gitea: Client used to look up the pull request.
        repo: Repository identifier passed to the Gitea client.
        pr_number: Number of the pull request to review.
        command: Parsed trigger command.

    Returns:
        The normalized review dict and the repository's review configuration.
    """
    prompt, diff_context, repo_cfg = prepare_review_prompt(settings, gitea, repo, pr_number, command)
    try:
        raw_review = _call_openai_review(settings, prompt)
    except Exception as error:
        # Any failure of the API call degrades to a fallback review instead of
        # aborting the whole run.
        reason = _summarize_openai_failure(error)
        raw_review = _fallback_review(diff_context, failure_reason=reason)
    normalized = normalize_review_result(raw_review)
    return normalized, repo_cfg


def prepare_review_prompt(
    settings: Settings,
    gitea: GiteaClient,
    repo: str,
    pr_number: int,
    command: ParsedCommand,
) -> tuple[str, dict[str, Any], RepoReviewConfig]:
    """Check out the pull request and build the review prompt for it.

    The checkout lives in a temporary directory that is removed before this
    function returns. For a ``review`` command without an explicit mode,
    ``command.mode`` is overwritten with the repository's default mode
    (``summary`` if that default is not a known mode).

    Args:
        settings: Application settings (diff size and review time limits).
        gitea: Client used to look up the pull request.
        repo: Repository identifier passed to the Gitea client.
        pr_number: Number of the pull request.
        command: Parsed trigger command; its ``mode`` may be updated in place.

    Returns:
        The prompt text, the diff context (redacted diff, filtered file list,
        truncation flag) and the repository's review configuration.
    """
    pr = gitea.get_pull_request(repo, pr_number)
    with TemporaryDirectory(prefix="gitea-codex-") as tmp:
        tmpdir = Path(tmp)
        repo_dir = checkout_pr(tmpdir, pr)
        repo_cfg = load_repo_review_config(repo_dir)
        if command.name == "review" and not command.mode_explicit:
            configured_mode = repo_cfg.default_mode
            command.mode = configured_mode if configured_mode in {"summary", "security", "performance", "tests", "full"} else "summary"
        diff_context = collect_diff_context(repo_dir, pr, min(settings.max_diff_bytes, repo_cfg.max_diff_bytes))
        diff_context["changed_files"] = _apply_ignore_patterns(diff_context["changed_files"], repo_cfg.ignore)
        diff_context["diff"] = _redact_secrets_from_diff(diff_context["diff"])
        changed_file_contents = ""
        if command.full:
            raw_contents = _collect_changed_file_contents(repo_dir, diff_context["changed_files"], settings.max_diff_bytes)
            # The full file text repeats the lines redacted from the diff, so
            # it gets the same redaction before it is put into the prompt.
            changed_file_contents = _redact_secrets_from_diff(raw_contents)
        test_output = None
        if repo_cfg.include_tests and command.mode == "tests":
            # Tests are capped at five minutes regardless of the review limit.
            test_output = _collect_test_output(repo_dir, timeout_seconds=min(settings.max_review_minutes * 60, 300))
        prompt = _build_prompt(
            pr,
            command,
            diff_context,
            repo_cfg,
            changed_file_contents=changed_file_contents,
            test_output=test_output,
        )
    return prompt, diff_context, repo_cfg


def normalize_review_result(result: Any) -> dict[str, Any]:
    """Fill in the keys every review result is expected to carry.

    Keys that are already present keep their value, whatever it is. The dict
    is updated in place and returned.

    Args:
        result: Review result to normalize; must be a dict.

    Returns:
        The same dict, guaranteed to contain ``findings``, ``summary``,
        ``verdict`` and ``confidence``.

    Raises:
        ReviewError: If *result* is not a dict.
    """
    if not isinstance(result, dict):
        raise ReviewError(f"Invalid review result type: {type(result)!r}")
    defaults = (
        ("findings", []),
        ("summary", "No summary returned."),
        ("verdict", "has_issues"),
        ("confidence", 0.5),
    )
    for key, fallback in defaults:
        result.setdefault(key, fallback)
    return result


def summarize_command(command: ParsedCommand) -> str:
    """Render *command* as the ``@codex <name> <arguments...>`` text.

    Args:
        command: Parsed command whose ``name`` and ``arguments`` are used.

    Returns:
        The space-joined command line with surrounding whitespace removed.
    """
    parts = ["@codex", command.name]
    parts.extend(command.arguments)
    return " ".join(parts).strip()


def fix_branch_name(pr_number: int, arguments: list[str] | None = None) -> str:
    """Derive the name of the fix branch for a pull request.

    Up to four non-option arguments (those not starting with ``--``) are
    lower-cased, joined with ``-`` and reduced to alphanumerics and dashes to
    form a readable suffix.

    Args:
        pr_number: Number of the pull request being fixed.
        arguments: Command arguments used to describe the fix, if any.

    Returns:
        ``codex/pr-<n>-fix-<slug>`` when a slug can be built, otherwise
        ``codex/pr-<n>-fix``.
    """
    default_name = f"codex/pr-{pr_number}-fix"
    if not arguments:
        return default_name

    words = [
        token.lower().strip()
        for token in arguments
        if token.strip() and not token.startswith("--")
    ]
    slug_source = "-".join(words[:4])
    # Anything that is not a letter, digit or dash becomes a dash.
    slug_chars = [ch if (ch.isalnum() or ch == "-") else "-" for ch in slug_source]
    slug = "".join(slug_chars).strip("-")
    if not slug:
        return default_name
    return f"codex/pr-{pr_number}-fix-{slug}"


def create_fix_patch_note(command: ParsedCommand) -> str:
    """Describe what a fix command was asked to address.

    Args:
        command: Parsed command; its ``arguments`` are shell-quoted and joined.

    Returns:
        A one-sentence note naming the arguments, or ``latest findings`` when
        the command has none.
    """
    if command.arguments:
        subject = shlex.join(command.arguments)
    else:
        subject = "latest findings"
    return f"Fix command requested for {subject}."


def create_fix_branch(
    pr: PullRequestContext,
    *,
    note: str,
    arguments: list[str] | None = None,
) -> str:
    """Create and push a branch that carries a fix note for the pull request.

    The PR head is checked out in a temporary directory, a new branch is
    created from it, ``.codex/fix-note.md`` is written and committed as
    ``codex-bot``, and the branch is force-pushed to ``origin``.

    Args:
        pr: Pull request to branch from.
        note: Text written into the fix note file.
        arguments: Command arguments used to derive the branch name.

    Returns:
        The name of the pushed branch.

    Raises:
        subprocess.CalledProcessError: If any git command fails.
    """
    branch = fix_branch_name(pr.pr_number, arguments=arguments)
    with TemporaryDirectory(prefix="gitea-codex-fix-") as workdir:
        repo_dir = checkout_pr(Path(workdir), pr)

        def git(*args: str) -> None:
            _run_git(list(args), cwd=repo_dir)

        git("checkout", "-b", branch)

        codex_dir = repo_dir / ".codex"
        codex_dir.mkdir(parents=True, exist_ok=True)
        note_file = codex_dir / "fix-note.md"
        note_file.write_text(f"# Codex Fix Note\n\n{note}\n", encoding="utf-8")

        git("add", ".codex/fix-note.md")
        # Identity is passed per command so the commit does not depend on the
        # host's git configuration.
        git(
            "-c",
            "user.name=codex-bot",
            "-c",
            "user.email=codex-bot@example.invalid",
            "commit",
            "-m",
            f"Codex fix note for PR {pr.pr_number}",
        )
        git("push", "origin", f"{branch}:{branch}", "--force")
    return branch