Add whisper remote backend and CLI

2026-05-24 12:36:24 +02:00
parent ad10a62ae6
commit aa89590555
12 changed files with 445 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+.pytest_cache/
+*.egg-info/
--- a/backend/README.md
+++ b/backend/README.md
@@ -0,0 +1,23 @@
+# whisper-remote-backend
+
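+FastAPI wrapper around the upstream `whisper` CLI from `openai/whisper`. The `whisper` executable must already be installed and available on `PATH` on the backend host; this package does not install it.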
+
+## Run
+
+```bash
+pip install -e .
+uvicorn whisper_remote_backend.server:app --host 0.0.0.0 --port 8000
+```
+
+## API
+
+`POST /transcriptions`
+
+Multipart form fields:
+
+- `file`: media file
+- `model`: Whisper model name
+- `language`: optional language code
+- `output_format`: `txt`, `vtt`, `srt`, `tsv`, or `json`
+
+The response body is the transcript artifact itself. The backend deletes the uploaded file and generated output after each request.
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -0,0 +1,30 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "whisper-remote-backend"
+version = "0.1.0"
+description = "FastAPI wrapper around the openai/whisper CLI"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "fastapi>=0.115.0,<1.0.0",
+  "python-multipart>=0.0.9,<1.0.0",
+  "uvicorn>=0.32.0,<1.0.0",
+]
+
+[project.scripts]
+whisper-remote-server = "whisper_remote_backend.server:main"
+
+[project.optional-dependencies]
+dev = [
+  "httpx>=0.28.0,<1.0.0",
+  "pytest>=8.3.0,<9.0.0",
+]
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
--- a/backend/src/whisper_remote_backend/init.py
+++ b/backend/src/whisper_remote_backend/init.py
@@ -0,0 +1 @@
+"""whisper-remote backend package."""
--- a/backend/src/whisper_remote_backend/server.py
+++ b/backend/src/whisper_remote_backend/server.py
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import Response
+
+# Output formats accepted by the endpoint; the chosen value is passed to whisper's --output_format.
+SUPPORTED_FORMATS = {"txt", "vtt", "srt", "tsv", "json"}
+# Media type sent back to the client for each supported output format.
+CONTENT_TYPES = {
+    "txt": "text/plain; charset=utf-8",
+    "vtt": "text/vtt; charset=utf-8",
+    "srt": "application/x-subrip; charset=utf-8",
+    "tsv": "text/tab-separated-values; charset=utf-8",
+    "json": "application/json; charset=utf-8",
+}
+
+# ASGI application object, addressed by uvicorn as "whisper_remote_backend.server:app".
+app = FastAPI(title="whisper-remote-backend")
+
+
+def validate_output_format(output_format: str) -> str:
+    """Return the trimmed, lower-cased form of *output_format*.
+
+    Raises:
+        HTTPException: status 400 when the normalised value is not one of
+            ``SUPPORTED_FORMATS``.
+    """
+    candidate = output_format.strip().lower()
+    if candidate in SUPPORTED_FORMATS:
+        return candidate
+
+    supported = ", ".join(sorted(SUPPORTED_FORMATS))
+    message = f"Unsupported output format '{output_format}'. Supported formats: {supported}."
+    raise HTTPException(status_code=400, detail=message)
+
+
+def build_whisper_command(
+    *,
+    input_path: Path,
+    output_dir: Path,
+    model: str,
+    language: str | None,
+    output_format: str,
+) -> list[str]:
+    """Assemble the argv list used to invoke the ``whisper`` CLI.
+
+    Args:
+        input_path: Media file to transcribe.
+        output_dir: Directory passed as ``--output_dir``.
+        model: Value passed as ``--model``.
+        language: Value passed as ``--language``; the flag is omitted when
+            this is ``None`` or empty.
+        output_format: Value passed as ``--output_format``.
+
+    Returns:
+        The argument vector, starting with the program name ``"whisper"``.
+    """
+    language_args = ["--language", language] if language else []
+    return [
+        "whisper",
+        str(input_path),
+        "--model",
+        model,
+        "--output_format",
+        output_format,
+        "--output_dir",
+        str(output_dir),
+        *language_args,
+    ]
+
+
+async def save_upload(upload: UploadFile, destination: Path) -> None:
+    """Stream *upload* into *destination* in 1 MiB chunks, then close the upload.
+
+    Args:
+        upload: Incoming upload; it is read until ``read`` returns an empty chunk.
+        destination: File path that is created (or truncated) and written in binary mode.
+
+    The upload is closed in a ``finally`` clause so its underlying resources are
+    released even when reading or writing raises.
+    """
+    try:
+        with destination.open("wb") as handle:
+            while chunk := await upload.read(1024 * 1024):
+                handle.write(chunk)
+    finally:
+        await upload.close()
+
+
+def find_transcript_file(output_dir: Path, input_name: str, output_format: str) -> Path:
+    """Locate the transcript written into *output_dir*.
+
+    The file named ``<stem of input_name>.<output_format>`` is preferred. If it
+    does not exist, a file with the requested extension is accepted only when
+    it is the sole match in the directory.
+
+    Raises:
+        HTTPException: status 500 when no unambiguous transcript file is found.
+    """
+    stem = Path(input_name).stem
+    preferred = output_dir / f"{stem}.{output_format}"
+    if not preferred.exists():
+        candidates = list(output_dir.glob(f"*.{output_format}"))
+        if len(candidates) != 1:
+            raise HTTPException(
+                status_code=500,
+                detail="Whisper finished without producing the expected output file.",
+            )
+        return candidates[0]
+    return preferred
+
+
+@app.get("/health")
+def healthcheck() -> dict[str, str]:
+    """Liveness endpoint: always answers with ``{"status": "ok"}``."""
+    payload = {"status": "ok"}
+    return payload
+
+
+def _content_disposition(download_name: str) -> str:
+    """Return an ``attachment`` Content-Disposition value for *download_name*.
+
+    The plain ``filename`` parameter carries a printable-ASCII fallback in which
+    quotes, backslashes, control characters and non-ASCII characters are replaced
+    by ``_``. The ``filename*`` parameter carries the exact name, percent-encoded
+    as UTF-8. This keeps the header encodable and prevents a client-supplied name
+    from breaking out of the quoted string.
+    """
+    from urllib.parse import quote
+
+    fallback = "".join(
+        char if 32 <= ord(char) < 127 and char not in '"\\' else "_"
+        for char in download_name
+    )
+    encoded = quote(download_name, safe="")
+    return f"attachment; filename=\"{fallback}\"; filename*=UTF-8''{encoded}"
+
+
+@app.post("/transcriptions")
+async def transcribe(
+    file: UploadFile = File(...),
+    model: str = Form(...),
+    language: str | None = Form(default=None),
+    output_format: str = Form(...),
+) -> Response:
+    """Transcribe an uploaded media file with the ``whisper`` CLI and return the artifact.
+
+    Args:
+        file: Uploaded media file; only the final component of its filename is used.
+        model: Whisper model name, passed through as ``--model``.
+        language: Optional language code, passed as ``--language`` when non-empty.
+        output_format: One of ``SUPPORTED_FORMATS`` (case-insensitive).
+
+    Returns:
+        A response whose body is the generated transcript file. The upload and the
+        output live in temporary directories that are removed when the request ends.
+
+    Raises:
+        HTTPException: 400 for an unsupported format or a missing/unusable filename,
+            500 when the ``whisper`` executable is missing or produced no output file,
+            502 when ``whisper`` exits with a non-zero status.
+    """
+    import asyncio
+
+    normalized_format = validate_output_format(output_format)
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="Uploaded file must have a filename.")
+
+    # Keep only the last path component of the client-supplied name. Names that
+    # reduce to nothing or to a directory reference cannot be used as a file name.
+    safe_name = Path(file.filename).name
+    if safe_name in {"", ".", ".."}:
+        raise HTTPException(status_code=400, detail="Uploaded file must have a valid filename.")
+
+    with TemporaryDirectory(prefix="whisper-remote-upload-") as upload_root, TemporaryDirectory(
+        prefix="whisper-remote-output-"
+    ) as output_root:
+        input_path = Path(upload_root) / safe_name
+        output_dir = Path(output_root)
+        await save_upload(file, input_path)
+
+        command = build_whisper_command(
+            input_path=input_path,
+            output_dir=output_dir,
+            model=model,
+            language=language,
+            output_format=normalized_format,
+        )
+
+        try:
+            # Run the (potentially long) transcription in a worker thread so the
+            # event loop keeps serving other requests, e.g. /health.
+            completed = await asyncio.to_thread(
+                subprocess.run,
+                command,
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+        except FileNotFoundError as exc:
+            raise HTTPException(
+                status_code=500,
+                detail="The 'whisper' CLI was not found on PATH on the backend host.",
+            ) from exc
+
+        if completed.returncode != 0:
+            detail = completed.stderr.strip() or completed.stdout.strip() or "Whisper CLI failed."
+            raise HTTPException(status_code=502, detail=detail)
+
+        transcript_path = find_transcript_file(output_dir, safe_name, normalized_format)
+        # Read the artifact into memory before the temporary directories are removed.
+        content = transcript_path.read_bytes()
+        download_name = f"{Path(safe_name).stem}.{normalized_format}"
+
+        return Response(
+            content=content,
+            media_type=CONTENT_TYPES[normalized_format],
+            headers={
+                "Content-Disposition": _content_disposition(download_name),
+                "X-Whisper-Output-Format": normalized_format,
+                "X-Whisper-Model": model,
+            },
+        )
+
+
+def main() -> None:
+    """Console-script entry point: serve the app with uvicorn on 0.0.0.0:8000."""
+    from uvicorn import run
+
+    run("whisper_remote_backend.server:app", host="0.0.0.0", port=8000)
--- a/backend/tests/test_server.py
+++ b/backend/tests/test_server.py
@@ -0,0 +1,63 @@
+from pathlib import Path
+
+from fastapi.testclient import TestClient
+
+from whisper_remote_backend import server
+
+
+# Shared in-process test client bound to the FastAPI app; used by the endpoint tests below.
+client = TestClient(server.app)
+
+
+def test_validate_output_format_rejects_unknown() -> None:
+    """An unsupported format is rejected with an HTTPException carrying status 400."""
+    import pytest
+    from fastapi import HTTPException
+
+    # pytest.raises pins the exception type instead of catching any Exception.
+    with pytest.raises(HTTPException) as excinfo:
+        server.validate_output_format("docx")
+
+    assert excinfo.value.status_code == 400
+
+
+def test_transcriptions_returns_generated_artifact(monkeypatch, tmp_path: Path) -> None:
+    """A successful whisper run returns the generated transcript as the response body."""
+
+    class FakeCompleted:
+        returncode = 0
+        stdout = ""
+        stderr = ""
+
+    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
+        # Stand in for whisper: write a transcript into the directory given via --output_dir.
+        flag_index = command.index("--output_dir")
+        transcript = Path(command[flag_index + 1]) / "clip.txt"
+        transcript.write_text("hello world", encoding="utf-8")
+        return FakeCompleted()
+
+    monkeypatch.setattr(server.subprocess, "run", fake_run)
+
+    form_fields = {"model": "base", "language": "en", "output_format": "txt"}
+    upload = {"file": ("clip.wav", b"audio", "audio/wav")}
+    response = client.post("/transcriptions", data=form_fields, files=upload)
+
+    assert response.status_code == 200
+    assert response.text == "hello world"
+    assert response.headers["x-whisper-output-format"] == "txt"
+
+
+def test_transcriptions_maps_subprocess_failure(monkeypatch) -> None:
+    """A non-zero whisper exit becomes a 502 whose detail is the captured stderr."""
+
+    class FailedRun:
+        returncode = 1
+        stdout = ""
+        stderr = "bad whisper day"
+
+    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
+        return FailedRun()
+
+    monkeypatch.setattr(server.subprocess, "run", fake_run)
+
+    form_fields = {"model": "base", "output_format": "txt"}
+    upload = {"file": ("clip.wav", b"audio", "audio/wav")}
+    response = client.post("/transcriptions", data=form_fields, files=upload)
+
+    assert response.status_code == 502
+    assert response.json()["detail"] == "bad whisper day"
--- a/cli/README.md
+++ b/cli/README.md
@@ -0,0 +1,13 @@
+# whisper-remote
+
+Local CLI that forwards media files to a remote `whisper-remote-backend` server.
+
+## Run
+
+```bash
+pip install -e .
+export WHISPER_REMOTE=http://127.0.0.1:8000
+whisper-remote ./audio.mp3 --model base --language en --output-format txt
+```
+
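+Use `--to-file PATH` to save the returned transcript locally instead of printing it to standard output. `PATH` may be a file path or a directory; the path that was written is printed when the command finishes.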
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "whisper-remote-cli"
+version = "0.1.0"
+description = "CLI that forwards transcription requests to whisper-remote-backend"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "httpx>=0.28.0,<1.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.3.0,<9.0.0",
+]
+
+[project.scripts]
+whisper-remote = "whisper_remote_cli.main:main"
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
--- a/cli/src/whisper_remote_cli/init.py
+++ b/cli/src/whisper_remote_cli/init.py
@@ -0,0 +1 @@
+"""whisper-remote CLI package."""
--- a/cli/src/whisper_remote_cli/main.py
+++ b/cli/src/whisper_remote_cli/main.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import httpx
+
+# Values accepted by --output-format; used as the argparse `choices` for that option.
+SUPPORTED_FORMATS = ("txt", "vtt", "srt", "tsv", "json")
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for the ``whisper-remote`` command line."""
+    cli = argparse.ArgumentParser(
+        description="Send transcription jobs to a remote Whisper backend.",
+    )
+
+    # Input file and the options that are forwarded to the backend.
+    cli.add_argument(
+        "file",
+        type=Path,
+        help="Path to the local media file to upload.",
+    )
+    cli.add_argument(
+        "--model",
+        required=True,
+        help="Whisper model name to use on the backend.",
+    )
+    cli.add_argument(
+        "--language",
+        help="Optional language code to pass through to Whisper.",
+    )
+    cli.add_argument(
+        "--output-format",
+        default="txt",
+        choices=SUPPORTED_FORMATS,
+        help="Transcript artifact format returned by the backend.",
+    )
+
+    # Options that only affect the local side.
+    cli.add_argument(
+        "--server",
+        help="Override the backend base URL. Defaults to the WHISPER_REMOTE environment variable.",
+    )
+    cli.add_argument(
+        "--to-file",
+        type=Path,
+        help="Optional local file path or directory to save the returned transcript artifact.",
+    )
+    return cli
+
+
+def resolve_server(args: argparse.Namespace) -> str:
+    """Return the backend base URL without trailing slashes.
+
+    ``args.server`` takes precedence; otherwise the ``WHISPER_REMOTE`` environment
+    variable is used.
+
+    Raises:
+        SystemExit: when neither source provides a non-empty value.
+    """
+    base_url = args.server if args.server else os.environ.get("WHISPER_REMOTE")
+    if base_url:
+        return base_url.rstrip("/")
+    raise SystemExit("WHISPER_REMOTE is not set and --server was not provided.")
+
+
+def infer_output_path(target: Path, input_file: Path, output_format: str) -> Path:
+    """Work out the file that a ``--to-file`` target refers to.
+
+    A target that is not an existing directory and has a suffix is used as the
+    file itself. Anything else (an existing directory, or a path without a
+    suffix) is treated as a directory that receives
+    ``<input stem>.<output_format>``.
+    """
+    if target.suffix and not target.is_dir():
+        return target
+    default_name = f"{input_file.stem}.{output_format}"
+    return target / default_name
+
+
+def print_response(response: httpx.Response) -> None:
+    """Write the response text to stdout, adding a newline if it does not end with one."""
+    body = response.text
+    stream = sys.stdout
+    stream.write(body)
+    needs_newline = bool(body) and not body.endswith("\n")
+    if needs_newline:
+        stream.write("\n")
+
+
+def save_response(response: httpx.Response, destination: Path) -> None:
+    """Write the raw response body to *destination*, creating parent directories first."""
+    parent_dir = destination.parent
+    parent_dir.mkdir(parents=True, exist_ok=True)
+    payload = response.content
+    destination.write_bytes(payload)
+
+
+def main() -> int:
+    """Upload a media file to the backend and print or save the returned transcript.
+
+    Returns:
+        ``0`` on success.
+
+    Exits (via the argument parser) with status 2 when the input file does not
+    exist, and with status 1 when the backend cannot be reached or answers with
+    an error status. ``resolve_server`` raises ``SystemExit`` when no backend URL
+    is configured.
+    """
+    parser = build_parser()
+    args = parser.parse_args()
+
+    input_file = args.file.expanduser().resolve()
+    if not input_file.is_file():
+        parser.error(f"Input file does not exist: {input_file}")
+
+    server = resolve_server(args)
+    endpoint = f"{server}/transcriptions"
+
+    # Only send `language` when one was given, rather than an empty form field.
+    form_fields = {"model": args.model, "output_format": args.output_format}
+    if args.language:
+        form_fields["language"] = args.language
+
+    try:
+        with input_file.open("rb") as handle, httpx.Client(timeout=300.0) as client:
+            response = client.post(
+                endpoint,
+                data=form_fields,
+                files={"file": (input_file.name, handle, "application/octet-stream")},
+            )
+    except httpx.RequestError as exc:
+        # Connection failures, timeouts and similar transport errors: report them
+        # as a normal CLI error instead of a traceback.
+        parser.exit(1, f"Request to {endpoint} failed: {exc}\n")
+
+    try:
+        response.raise_for_status()
+    except httpx.HTTPStatusError as exc:
+        message = exc.response.text.strip() or str(exc)
+        parser.exit(1, f"{message}\n")
+
+    if args.to_file:
+        destination = infer_output_path(args.to_file.expanduser(), input_file, args.output_format)
+        save_response(response, destination)
+        sys.stdout.write(f"{destination}\n")
+    else:
+        print_response(response)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/cli/tests/conftest.py
+++ b/cli/tests/conftest.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+
+# ROOT is the directory one level above this tests/ directory; SRC is its src/ subdirectory.
+ROOT = Path(__file__).resolve().parents[1]
+SRC = ROOT / "src"
+
+# Put src/ at the front of sys.path (once) so modules under it can be imported by the tests.
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
--- a/cli/tests/test_main.py
+++ b/cli/tests/test_main.py
@@ -0,0 +1,28 @@
+import os
+from argparse import Namespace
+from pathlib import Path
+
+import pytest
+
+from whisper_remote_cli import main
+
+
+def test_resolve_server_from_env(monkeypatch) -> None:
+    """Without --server, WHISPER_REMOTE is used and its trailing slash is removed."""
+    monkeypatch.setenv("WHISPER_REMOTE", "http://localhost:8000/")
+    namespace = Namespace(server=None)
+    resolved = main.resolve_server(namespace)
+    assert resolved == "http://localhost:8000"
+
+
+def test_resolve_server_requires_value(monkeypatch) -> None:
+    """With neither --server nor WHISPER_REMOTE, resolving the server exits."""
+    monkeypatch.delenv("WHISPER_REMOTE", raising=False)
+    namespace = Namespace(server=None)
+    with pytest.raises(SystemExit):
+        main.resolve_server(namespace)
+
+
+def test_infer_output_path_for_directory(tmp_path: Path) -> None:
+    """An existing directory target gets `<input stem>.<format>` appended."""
+    source = Path("clip.wav")
+    expected = tmp_path / "clip.srt"
+    assert main.infer_output_path(tmp_path, source, "srt") == expected
+
+
+def test_infer_output_path_for_explicit_file(tmp_path: Path) -> None:
+    """A target with a file suffix is returned unchanged."""
+    explicit = tmp_path / "custom-name.txt"
+    result = main.infer_output_path(explicit, Path("clip.wav"), "txt")
+    assert result == explicit