Add whisper remote backend and CLI

2026-05-24 12:36:24 +02:00
parent ad10a62ae6
commit aa89590555
12 changed files with 445 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
 __pycache__/
 .pytest_cache/
 *.egg-info/
--- a/backend/README.md
+++ b/backend/README.md
@@ -0,0 +1,23 @@
 # whisper-remote-backend
 FastAPI wrapper around the upstream `whisper` CLI from `openai/whisper`.
 ## Run
 ```bash
 pip install -e .
 uvicorn whisper_remote_backend.server:app --host 0.0.0.0 --port 8000
 ```
 ## API
 `POST /transcriptions`
 Multipart form fields:
 - `file`: media file
 - `model`: Whisper model name
 - `language`: optional language code
 - `output_format`: `txt`, `vtt`, `srt`, `tsv`, or `json`
 The response body is the transcript artifact itself. The backend deletes the uploaded file and generated output after each request.
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -0,0 +1,30 @@
 [build-system]
 requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "whisper-remote-backend"
 version = "0.1.0"
 description = "FastAPI wrapper around the openai/whisper CLI"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
  "fastapi>=0.115.0,<1.0.0",
  "python-multipart>=0.0.9,<1.0.0",
  "uvicorn>=0.32.0,<1.0.0",
 ]
 [project.scripts]
 whisper-remote-server = "whisper_remote_backend.server:main"
 [project.optional-dependencies]
 dev = [
  "httpx>=0.28.0,<1.0.0",
  "pytest>=8.3.0,<9.0.0",
 ]
 [tool.setuptools]
 package-dir = {"" = "src"}
 [tool.setuptools.packages.find]
 where = ["src"]
--- a/backend/src/whisper_remote_backend/__init__.py
+++ b/backend/src/whisper_remote_backend/__init__.py
@@ -0,0 +1 @@
 """whisper-remote backend package."""
--- a/backend/src/whisper_remote_backend/server.py
+++ b/backend/src/whisper_remote_backend/server.py
@@ -0,0 +1,144 @@
 from __future__ import annotations
 import asyncio
 import subprocess
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from urllib.parse import quote
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.responses import Response
 # Media type returned with each transcript format. The keys are also the
 # complete list of formats the API accepts.
 CONTENT_TYPES = dict(
    txt="text/plain; charset=utf-8",
    vtt="text/vtt; charset=utf-8",
    srt="application/x-subrip; charset=utf-8",
    tsv="text/tab-separated-values; charset=utf-8",
    json="application/json; charset=utf-8",
 )
 # Derived from CONTENT_TYPES so the two can never drift apart.
 SUPPORTED_FORMATS = set(CONTENT_TYPES)
 # ASGI application object that the route decorators below register on.
 app = FastAPI(title="whisper-remote-backend")
 def validate_output_format(output_format: str) -> str:
    """Return *output_format* stripped and lower-cased, rejecting unknown formats.

    Raises:
        HTTPException: status 400 when the normalized value is not one of
            SUPPORTED_FORMATS. The message quotes the value exactly as the
            caller sent it and lists the supported formats alphabetically.
    """
    candidate = output_format.strip().lower()
    if candidate in SUPPORTED_FORMATS:
        return candidate
    known_formats = ", ".join(sorted(SUPPORTED_FORMATS))
    message = f"Unsupported output format '{output_format}'. Supported formats: {known_formats}."
    raise HTTPException(status_code=400, detail=message)
 def build_whisper_command(
    *,
    input_path: Path,
    output_dir: Path,
    model: str,
    language: str | None,
    output_format: str,
 ) -> list[str]:
    command = [
        "whisper",
        str(input_path),
        "--model",
        model,
        "--output_format",
        output_format,
        "--output_dir",
        str(output_dir),
    ]
    if language:
        command.extend(["--language", language])
    return command
 async def save_upload(upload: UploadFile, destination: Path) -> None:
    """Stream *upload* into *destination* in 1 MiB chunks, then close the upload.

    Args:
        upload: Uploaded file to read from.
        destination: Path of the file to create (or overwrite) with the upload's bytes.

    Any error raised while reading or writing propagates to the caller; the
    upload is closed either way.
    """
    chunk_size = 1024 * 1024
    try:
        with destination.open("wb") as handle:
            # An empty chunk signals end of stream and ends the loop.
            while chunk := await upload.read(chunk_size):
                handle.write(chunk)
    finally:
        # Close in ``finally`` so a failed read/write does not leave the upload open.
        await upload.close()
 def find_transcript_file(output_dir: Path, input_name: str, output_format: str) -> Path:
    """Locate the transcript written into *output_dir*.

    The file named after the input's stem (``<stem>.<output_format>``) is
    preferred. If it does not exist, the directory is searched for files with
    the requested extension and the match is used only when it is unambiguous.

    Raises:
        HTTPException: status 500 when there is no such file, or more than one
            candidate and none carries the expected name.
    """
    stem = Path(input_name).stem
    preferred = output_dir / f"{stem}.{output_format}"
    if preferred.exists():
        return preferred
    candidates = list(output_dir.glob(f"*.{output_format}"))
    if len(candidates) != 1:
        raise HTTPException(
            status_code=500,
            detail="Whisper finished without producing the expected output file.",
        )
    return candidates[0]
@app.get("/health")
 def healthcheck() -> dict[str, str]:
    return {"status": "ok"}
@app.post("/transcriptions")
 async def transcribe(
    file: UploadFile = File(...),
    model: str = Form(...),
    language: str | None = Form(default=None),
    output_format: str = Form(...),
 ) -> Response:
    normalized_format = validate_output_format(output_format)
    if not file.filename:
        raise HTTPException(status_code=400, detail="Uploaded file must have a filename.")
    with TemporaryDirectory(prefix="whisper-remote-upload-") as upload_root, TemporaryDirectory(
        prefix="whisper-remote-output-"
    ) as output_root:
        input_path = Path(upload_root) / Path(file.filename).name
        output_dir = Path(output_root)
        await save_upload(file, input_path)
        command = build_whisper_command(
            input_path=input_path,
            output_dir=output_dir,
            model=model,
            language=language,
            output_format=normalized_format,
        )
        try:
            completed = subprocess.run(
                command,
                check=False,
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as exc:
            raise HTTPException(
                status_code=500,
                detail="The 'whisper' CLI was not found on PATH on the backend host.",
            ) from exc
        if completed.returncode != 0:
            detail = completed.stderr.strip() or completed.stdout.strip() or "Whisper CLI failed."
            raise HTTPException(status_code=502, detail=detail)
        transcript_path = find_transcript_file(output_dir, file.filename, normalized_format)
        content = transcript_path.read_bytes()
        download_name = f"{Path(file.filename).stem}.{normalized_format}"
        return Response(
            content=content,
            media_type=CONTENT_TYPES[normalized_format],
            headers={
                "Content-Disposition": f'attachment; filename="{download_name}"',
                "X-Whisper-Output-Format": normalized_format,
                "X-Whisper-Model": model,
            },
        )
 def main() -> None:
    """Console-script entry point: serve the app with uvicorn on 0.0.0.0:8000."""
    # Imported lazily so importing this module does not require uvicorn.
    from uvicorn import run as serve
    serve("whisper_remote_backend.server:app", host="0.0.0.0", port=8000)
--- a/backend/tests/test_server.py
+++ b/backend/tests/test_server.py
@@ -0,0 +1,63 @@
 from pathlib import Path
 from fastapi.testclient import TestClient
 from whisper_remote_backend import server
 client = TestClient(server.app)
 def test_validate_output_format_rejects_unknown() -> None:
    """An unknown format must raise an exception carrying status code 400."""
    caught = None
    try:
        server.validate_output_format("docx")
    except Exception as exc:
        caught = exc
    if caught is None:
        raise AssertionError("Expected HTTPException")
    assert getattr(caught, "status_code", None) == 400
 def test_transcriptions_returns_generated_artifact(monkeypatch, tmp_path: Path) -> None:
    """A successful whisper run returns the generated transcript as the response body."""
    class FakeCompleted:
        returncode = 0
        stdout = ""
        stderr = ""
    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
        # Stand in for whisper: write the artifact into the requested output directory.
        flag_index = command.index("--output_dir")
        artifact = Path(command[flag_index + 1]) / "clip.txt"
        artifact.write_text("hello world", encoding="utf-8")
        return FakeCompleted()
    monkeypatch.setattr(server.subprocess, "run", fake_run)
    form_fields = {"model": "base", "language": "en", "output_format": "txt"}
    upload = {"file": ("clip.wav", b"audio", "audio/wav")}
    response = client.post("/transcriptions", data=form_fields, files=upload)
    assert response.status_code == 200
    assert response.text == "hello world"
    assert response.headers["x-whisper-output-format"] == "txt"
 def test_transcriptions_maps_subprocess_failure(monkeypatch) -> None:
    """A non-zero whisper exit is reported as 502 with stderr as the detail."""
    class FailedRun:
        returncode = 1
        stdout = ""
        stderr = "bad whisper day"
    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
        return FailedRun()
    monkeypatch.setattr(server.subprocess, "run", fake_run)
    form_fields = {"model": "base", "output_format": "txt"}
    upload = {"file": ("clip.wav", b"audio", "audio/wav")}
    response = client.post("/transcriptions", data=form_fields, files=upload)
    assert response.status_code == 502
    assert response.json()["detail"] == "bad whisper day"
--- a/cli/README.md
+++ b/cli/README.md
@@ -0,0 +1,13 @@
 # whisper-remote
 Local CLI that forwards media files to a remote `whisper-remote-backend` server.
 ## Run
 ```bash
 pip install -e .
 export WHISPER_REMOTE=http://127.0.0.1:8000
 whisper-remote ./audio.mp3 --model base --language en --output-format txt
 ```
 Use `--to-file PATH` to save the returned transcript locally instead of printing it to stdout; `PATH` may be a file or a directory.
--- a/cli/pyproject.toml
+++ b/cli/pyproject.toml
@@ -0,0 +1,27 @@
 [build-system]
 requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "whisper-remote-cli"
 version = "0.1.0"
 description = "CLI that forwards transcription requests to whisper-remote-backend"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
  "httpx>=0.28.0,<1.0.0",
 ]
 [project.optional-dependencies]
 dev = [
  "pytest>=8.3.0,<9.0.0",
 ]
 [project.scripts]
 whisper-remote = "whisper_remote_cli.main:main"
 [tool.setuptools]
 package-dir = {"" = "src"}
 [tool.setuptools.packages.find]
 where = ["src"]
--- a/cli/src/whisper_remote_cli/__init__.py
+++ b/cli/src/whisper_remote_cli/__init__.py
@@ -0,0 +1 @@
 """whisper-remote CLI package."""
--- a/cli/src/whisper_remote_cli/main.py
+++ b/cli/src/whisper_remote_cli/main.py
@@ -0,0 +1,101 @@
 from __future__ import annotations
 import argparse
 import os
 import sys
 from pathlib import Path
 import httpx
 SUPPORTED_FORMATS = ("txt", "vtt", "srt", "tsv", "json")
 def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``whisper-remote`` command.

    Positional ``file`` and ``--model`` are required. ``--output-format``
    defaults to ``txt`` and is restricted to SUPPORTED_FORMATS. ``--language``,
    ``--server`` and ``--to-file`` are optional and default to ``None``.
    """
    cli = argparse.ArgumentParser(
        description="Send transcription jobs to a remote Whisper backend.",
    )
    add = cli.add_argument
    add("file", type=Path, help="Path to the local media file to upload.")
    add("--model", required=True, help="Whisper model name to use on the backend.")
    add("--language", help="Optional language code to pass through to Whisper.")
    add(
        "--output-format",
        default="txt",
        choices=SUPPORTED_FORMATS,
        help="Transcript artifact format returned by the backend.",
    )
    add(
        "--server",
        help="Override the backend base URL. Defaults to the WHISPER_REMOTE environment variable.",
    )
    add(
        "--to-file",
        type=Path,
        help="Optional local file path or directory to save the returned transcript artifact.",
    )
    return cli
 def resolve_server(args: argparse.Namespace) -> str:
    """Return the backend base URL with trailing slashes removed.

    ``args.server`` wins when set; otherwise the ``WHISPER_REMOTE``
    environment variable is used.

    Raises:
        SystemExit: when neither source provides a non-empty value.
    """
    base_url = args.server if args.server else os.environ.get("WHISPER_REMOTE")
    if base_url:
        return base_url.rstrip("/")
    raise SystemExit("WHISPER_REMOTE is not set and --server was not provided.")
 def infer_output_path(target: Path, input_file: Path, output_format: str) -> Path:
    """Decide where the transcript should be written for a ``--to-file`` target.

    Args:
        target: Path given by the user; may name a file or a directory.
        input_file: Uploaded media file; its stem provides the default file name.
        output_format: Transcript format, used as the default file extension.

    Returns:
        ``target`` itself when it is an existing regular file or has a suffix;
        otherwise ``target / "<input stem>.<output_format>"``, treating
        ``target`` as an existing or yet-to-be-created directory.
    """
    default_name = f"{input_file.stem}.{output_format}"
    if target.is_dir():
        return target / default_name
    # An existing regular file is a file target even without a suffix. Treating
    # it as a directory would make the caller's mkdir() of the parent fail.
    if target.is_file() or target.suffix:
        return target
    return target / default_name
 def print_response(response: httpx.Response) -> None:
    """Write the response text to stdout, adding a final newline if a non-empty body lacks one."""
    body = response.text
    sys.stdout.write(body)
    needs_newline = bool(body) and not body.endswith("\n")
    if needs_newline:
        sys.stdout.write("\n")
 def save_response(response: httpx.Response, destination: Path) -> None:
    """Write the raw response bytes to *destination*, creating parent directories as needed."""
    parent_dir = destination.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    payload = response.content
    destination.write_bytes(payload)
 def main() -> int:
    """Run the CLI: upload the input file, then print or save the returned transcript.

    Returns:
        0 on success.

    Raises:
        SystemExit: via argparse when the input file is missing; with status 1
            when the request cannot be completed or the backend answers with
            an error status; or from ``resolve_server`` when no backend URL
            is configured.
    """
    parser = build_parser()
    args = parser.parse_args()
    input_file = args.file.expanduser().resolve()
    if not input_file.is_file():
        parser.error(f"Input file does not exist: {input_file}")
    server = resolve_server(args)
    endpoint = f"{server}/transcriptions"
    form_fields = {"model": args.model, "output_format": args.output_format}
    if args.language:
        # Leave the optional field out entirely instead of sending an empty string.
        form_fields["language"] = args.language
    try:
        with input_file.open("rb") as handle, httpx.Client(timeout=300.0) as client:
            response = client.post(
                endpoint,
                data=form_fields,
                files={"file": (input_file.name, handle, "application/octet-stream")},
            )
    except httpx.RequestError as exc:
        # Connection failures, timeouts and unsupported URL schemes end up here.
        # Report them like HTTP errors rather than with a traceback. Some of
        # these exceptions have an empty message, hence the class-name fallback.
        reason = str(exc) or type(exc).__name__
        parser.exit(1, f"Request to {endpoint} failed: {reason}\n")
    try:
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        message = exc.response.text.strip() or str(exc)
        parser.exit(1, f"{message}\n")
    if args.to_file:
        destination = infer_output_path(args.to_file.expanduser(), input_file, args.output_format)
        save_response(response, destination)
        # Tell the user where the transcript ended up.
        sys.stdout.write(f"{destination}\n")
    else:
        print_response(response)
    return 0
 if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
--- a/cli/tests/conftest.py
+++ b/cli/tests/conftest.py
@@ -0,0 +1,11 @@
 from __future__ import annotations
 import sys
 from pathlib import Path
 # Two levels above this file: the directory that holds both tests/ and src/.
 ROOT = Path(__file__).resolve().parents[1]
 SRC = ROOT.joinpath("src")
 # Put src/ first on sys.path so the package imports without being installed.
 if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
--- a/cli/tests/test_main.py
+++ b/cli/tests/test_main.py
@@ -0,0 +1,28 @@
 import os
 from argparse import Namespace
 from pathlib import Path
 import pytest
 from whisper_remote_cli import main
 def test_resolve_server_from_env(monkeypatch) -> None:
    """The env-provided URL is used, minus its trailing slash, when --server is absent."""
    monkeypatch.setenv("WHISPER_REMOTE", "http://localhost:8000/")
    args_without_server = Namespace(server=None)
    resolved = main.resolve_server(args_without_server)
    assert resolved == "http://localhost:8000"
 def test_resolve_server_requires_value(monkeypatch) -> None:
    """With neither --server nor WHISPER_REMOTE set, resolution exits."""
    monkeypatch.delenv("WHISPER_REMOTE", raising=False)
    args_without_server = Namespace(server=None)
    with pytest.raises(SystemExit):
        main.resolve_server(args_without_server)
 def test_infer_output_path_for_directory(tmp_path: Path) -> None:
    """An existing directory target gets ``<input stem>.<format>`` appended."""
    expected = tmp_path / "clip.srt"
    assert main.infer_output_path(tmp_path, Path("clip.wav"), "srt") == expected
 def test_infer_output_path_for_explicit_file(tmp_path: Path) -> None:
    """A target that carries a suffix is returned unchanged as the output file."""
    explicit_target = tmp_path / "custom-name.txt"
    resolved = main.infer_output_path(explicit_target, Path("clip.wav"), "txt")
    assert resolved == explicit_target