From aa895905551e03b4634db01ae69d2791d199e9ac Mon Sep 17 00:00:00 2001
From: space
Date: Sun, 24 May 2026 12:36:24 +0200
Subject: [PATCH] Add whisper remote backend and CLI

---
Review revision: restores the line structure of the patch and updates
server.py, test_server.py and main.py; the blob ids on their index lines
still refer to the pre-revision contents.

 .gitignore                                   |   3 +
 backend/README.md                            |  23 +++
 backend/pyproject.toml                       |  30 ++++
 .../src/whisper_remote_backend/__init__.py   |   1 +
 backend/src/whisper_remote_backend/server.py | 178 +++++++++++++++++++
 backend/tests/test_server.py                 |  68 +++++++
 cli/README.md                                |  13 ++
 cli/pyproject.toml                           |  27 +++
 cli/src/whisper_remote_cli/__init__.py       |   1 +
 cli/src/whisper_remote_cli/main.py           | 113 ++++++++++++
 cli/tests/conftest.py                        |  11 ++
 cli/tests/test_main.py                       |  28 +++
 12 files changed, 496 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 backend/README.md
 create mode 100644 backend/pyproject.toml
 create mode 100644 backend/src/whisper_remote_backend/__init__.py
 create mode 100644 backend/src/whisper_remote_backend/server.py
 create mode 100644 backend/tests/test_server.py
 create mode 100644 cli/README.md
 create mode 100644 cli/pyproject.toml
 create mode 100644 cli/src/whisper_remote_cli/__init__.py
 create mode 100644 cli/src/whisper_remote_cli/main.py
 create mode 100644 cli/tests/conftest.py
 create mode 100644 cli/tests/test_main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d9f2830
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__/
+.pytest_cache/
+*.egg-info/
diff --git a/backend/README.md b/backend/README.md
new file mode 100644
index 0000000..574906e
--- /dev/null
+++ b/backend/README.md
@@ -0,0 +1,23 @@
+# whisper-remote-backend
+
+FastAPI wrapper around the upstream `whisper` CLI from `openai/whisper`.
+
+## Run
+
+```bash
+pip install -e .
+uvicorn whisper_remote_backend.server:app --host 0.0.0.0 --port 8000
+```
+
+## API
+
+`POST /transcriptions`
+
+Multipart form fields:
+
+- `file`: media file
+- `model`: Whisper model name
+- `language`: optional language code
+- `output_format`: `txt`, `vtt`, `srt`, `tsv`, or `json`
+
+The response body is the transcript artifact itself. The backend deletes the uploaded file and generated output after each request.
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
new file mode 100644
index 0000000..263cffa
--- /dev/null
+++ b/backend/pyproject.toml
@@ -0,0 +1,30 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "whisper-remote-backend"
+version = "0.1.0"
+description = "FastAPI wrapper around the openai/whisper CLI"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "fastapi>=0.115.0,<1.0.0",
+  "python-multipart>=0.0.9,<1.0.0",
+  "uvicorn>=0.32.0,<1.0.0",
+]
+
+[project.scripts]
+whisper-remote-server = "whisper_remote_backend.server:main"
+
+[project.optional-dependencies]
+dev = [
+  "httpx>=0.28.0,<1.0.0",
+  "pytest>=8.3.0,<9.0.0",
+]
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/backend/src/whisper_remote_backend/__init__.py b/backend/src/whisper_remote_backend/__init__.py
new file mode 100644
index 0000000..5ca4deb
--- /dev/null
+++ b/backend/src/whisper_remote_backend/__init__.py
@@ -0,0 +1 @@
+"""whisper-remote backend package."""
diff --git a/backend/src/whisper_remote_backend/server.py b/backend/src/whisper_remote_backend/server.py
new file mode 100644
index 0000000..1368c91
--- /dev/null
+++ b/backend/src/whisper_remote_backend/server.py
@@ -0,0 +1,178 @@
+from __future__ import annotations
+
+import asyncio
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from urllib.parse import quote
+
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.responses import Response
+
+SUPPORTED_FORMATS = {"txt", "vtt", "srt", "tsv", "json"}
+CONTENT_TYPES = {
+    "txt": "text/plain; charset=utf-8",
+    "vtt": "text/vtt; charset=utf-8",
+    "srt": "application/x-subrip; charset=utf-8",
+    "tsv": "text/tab-separated-values; charset=utf-8",
+    "json": "application/json; charset=utf-8",
+}
+
+app = FastAPI(title="whisper-remote-backend")
+
+
+def validate_output_format(output_format: str) -> str:
+    """Return the normalized output format or raise HTTP 400 if unsupported."""
+    normalized = output_format.strip().lower()
+    if normalized not in SUPPORTED_FORMATS:
+        supported = ", ".join(sorted(SUPPORTED_FORMATS))
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported output format '{output_format}'. Supported formats: {supported}.",
+        )
+    return normalized
+
+
+def safe_upload_name(filename: str | None) -> str:
+    """Return the basename of an uploaded filename or raise HTTP 400."""
+    # Path("..").name is ".." and Path(".").name is "", neither of which can
+    # be used as a file name inside the temporary upload directory.
+    name = Path(filename or "").name
+    if name in {"", ".", ".."}:
+        raise HTTPException(status_code=400, detail="Uploaded file must have a filename.")
+    return name
+
+
+def build_content_disposition(download_name: str) -> str:
+    """Build an attachment Content-Disposition header for a client-derived name."""
+    # Percent-encode names that contain quotes, control or non-ASCII characters
+    # so they cannot break or inject into the header (RFC 6266 / RFC 5987).
+    encoded = quote(download_name)
+    if encoded == download_name:
+        return f'attachment; filename="{download_name}"'
+    return f"attachment; filename*=UTF-8''{encoded}"
+
+
+def build_whisper_command(
+    *,
+    input_path: Path,
+    output_dir: Path,
+    model: str,
+    language: str | None,
+    output_format: str,
+) -> list[str]:
+    """Build the argument list for the upstream ``whisper`` CLI."""
+    command = [
+        "whisper",
+        str(input_path),
+        "--model",
+        model,
+        "--output_format",
+        output_format,
+        "--output_dir",
+        str(output_dir),
+    ]
+    if language:
+        command.extend(["--language", language])
+    return command
+
+
+async def save_upload(upload: UploadFile, destination: Path) -> None:
+    """Stream the upload to ``destination`` in 1 MiB chunks, then close it."""
+    try:
+        with destination.open("wb") as handle:
+            while chunk := await upload.read(1024 * 1024):
+                handle.write(chunk)
+    finally:
+        # Close the upload even if the copy fails part-way through.
+        await upload.close()
+
+
+def find_transcript_file(output_dir: Path, input_name: str, output_format: str) -> Path:
+    """Locate the transcript Whisper wrote to ``output_dir`` or raise HTTP 500."""
+    expected = output_dir / f"{Path(input_name).stem}.{output_format}"
+    if expected.exists():
+        return expected
+
+    # Fall back to the only file with the requested extension, if unambiguous.
+    matches = list(output_dir.glob(f"*.{output_format}"))
+    if len(matches) == 1:
+        return matches[0]
+
+    raise HTTPException(
+        status_code=500,
+        detail="Whisper finished without producing the expected output file.",
+    )
+
+
+@app.get("/health")
+def healthcheck() -> dict[str, str]:
+    """Report that the service is up."""
+    return {"status": "ok"}
+
+
+@app.post("/transcriptions")
+async def transcribe(
+    file: UploadFile = File(...),
+    model: str = Form(...),
+    language: str | None = Form(default=None),
+    output_format: str = Form(...),
+) -> Response:
+    """Transcribe an uploaded media file with Whisper and return the artifact."""
+    normalized_format = validate_output_format(output_format)
+    upload_name = safe_upload_name(file.filename)
+
+    with TemporaryDirectory(prefix="whisper-remote-upload-") as upload_root, TemporaryDirectory(
+        prefix="whisper-remote-output-"
+    ) as output_root:
+        input_path = Path(upload_root) / upload_name
+        output_dir = Path(output_root)
+        await save_upload(file, input_path)
+
+        command = build_whisper_command(
+            input_path=input_path,
+            output_dir=output_dir,
+            model=model,
+            language=language,
+            output_format=normalized_format,
+        )
+
+        try:
+            # Run the blocking CLI call in a worker thread so the event loop stays free.
+            completed = await asyncio.to_thread(
+                subprocess.run,
+                command,
+                check=False,
+                capture_output=True,
+                text=True,
+            )
+        except FileNotFoundError as exc:
+            raise HTTPException(
+                status_code=500,
+                detail="The 'whisper' CLI was not found on PATH on the backend host.",
+            ) from exc
+
+        if completed.returncode != 0:
+            detail = completed.stderr.strip() or completed.stdout.strip() or "Whisper CLI failed."
+            raise HTTPException(status_code=502, detail=detail)
+
+        transcript_path = find_transcript_file(output_dir, upload_name, normalized_format)
+        content = transcript_path.read_bytes()
+        download_name = f"{Path(upload_name).stem}.{normalized_format}"
+
+    return Response(
+        content=content,
+        media_type=CONTENT_TYPES[normalized_format],
+        headers={
+            "Content-Disposition": build_content_disposition(download_name),
+            "X-Whisper-Output-Format": normalized_format,
+            "X-Whisper-Model": model,
+        },
+    )
+
+
+def main() -> None:
+    """Run the backend with uvicorn on all interfaces, port 8000."""
+    import uvicorn
+
+    uvicorn.run("whisper_remote_backend.server:app", host="0.0.0.0", port=8000)
diff --git a/backend/tests/test_server.py b/backend/tests/test_server.py
new file mode 100644
index 0000000..d28fdff
--- /dev/null
+++ b/backend/tests/test_server.py
@@ -0,0 +1,68 @@
+from pathlib import Path
+
+from fastapi.testclient import TestClient
+
+from whisper_remote_backend import server
+
+
+client = TestClient(server.app)
+
+
+def test_validate_output_format_rejects_unknown() -> None:
+    try:
+        server.validate_output_format("docx")
+    except Exception as exc:  # pragma: no cover - structure assertion below
+        assert getattr(exc, "status_code", None) == 400
+    else:  # pragma: no cover
+        raise AssertionError("Expected HTTPException")
+
+
+def test_transcriptions_returns_generated_artifact(monkeypatch, tmp_path: Path) -> None:
+    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
+        output_dir = Path(command[command.index("--output_dir") + 1])
+        (output_dir / "clip.txt").write_text("hello world", encoding="utf-8")
+
+        class Result:
+            returncode = 0
+            stdout = ""
+            stderr = ""
+
+        return Result()
+
+    monkeypatch.setattr(server.subprocess, "run", fake_run)
+
+    response = client.post(
+        "/transcriptions",
+        data={"model": "base", "language": "en", "output_format": "txt"},
+        files={"file": ("clip.wav", b"audio", "audio/wav")},
+    )
+
+    assert response.status_code == 200
+    assert response.text == "hello world"
+    assert response.headers["x-whisper-output-format"] == "txt"
+
+
+def test_transcriptions_maps_subprocess_failure(monkeypatch) -> None:
+    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
+        class Result:
+            returncode = 1
+            stdout = ""
+            stderr = "bad whisper day"
+
+        return Result()
+
+    monkeypatch.setattr(server.subprocess, "run", fake_run)
+
+    response = client.post(
+        "/transcriptions",
+        data={"model": "base", "output_format": "txt"},
+        files={"file": ("clip.wav", b"audio", "audio/wav")},
+    )
+
+    assert response.status_code == 502
+    assert response.json()["detail"] == "bad whisper day"
+
+
+def test_build_content_disposition_encodes_unsafe_names() -> None:
+    assert server.build_content_disposition("clip.txt") == 'attachment; filename="clip.txt"'
+    assert server.build_content_disposition('a"b.txt') == "attachment; filename*=UTF-8''a%22b.txt"
diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 0000000..6b3b894
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,13 @@
+# whisper-remote
+
+Local CLI that forwards media files to a remote `whisper-remote-backend` server.
+
+## Run
+
+```bash
+pip install -e .
+export WHISPER_REMOTE=http://127.0.0.1:8000
+whisper-remote ./audio.mp3 --model base --language en --output-format txt
+```
+
+Use `--to-file` to save the returned transcript locally.
diff --git a/cli/pyproject.toml b/cli/pyproject.toml
new file mode 100644
index 0000000..14ec2e6
--- /dev/null
+++ b/cli/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "whisper-remote-cli"
+version = "0.1.0"
+description = "CLI that forwards transcription requests to whisper-remote-backend"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "httpx>=0.28.0,<1.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+  "pytest>=8.3.0,<9.0.0",
+]
+
+[project.scripts]
+whisper-remote = "whisper_remote_cli.main:main"
+
+[tool.setuptools]
+package-dir = {"" = "src"}
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/cli/src/whisper_remote_cli/__init__.py b/cli/src/whisper_remote_cli/__init__.py
new file mode 100644
index 0000000..0608698
--- /dev/null
+++ b/cli/src/whisper_remote_cli/__init__.py
@@ -0,0 +1 @@
+"""whisper-remote CLI package."""
diff --git a/cli/src/whisper_remote_cli/main.py b/cli/src/whisper_remote_cli/main.py
new file mode 100644
index 0000000..277c741
--- /dev/null
+++ b/cli/src/whisper_remote_cli/main.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import httpx
+
+SUPPORTED_FORMATS = ("txt", "vtt", "srt", "tsv", "json")
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the command-line argument parser."""
+    parser = argparse.ArgumentParser(description="Send transcription jobs to a remote Whisper backend.")
+    parser.add_argument("file", type=Path, help="Path to the local media file to upload.")
+    parser.add_argument("--model", required=True, help="Whisper model name to use on the backend.")
+    parser.add_argument("--language", help="Optional language code to pass through to Whisper.")
+    parser.add_argument(
+        "--output-format",
+        default="txt",
+        choices=SUPPORTED_FORMATS,
+        help="Transcript artifact format returned by the backend.",
+    )
+    parser.add_argument(
+        "--server",
+        help="Override the backend base URL. Defaults to the WHISPER_REMOTE environment variable.",
+    )
+    parser.add_argument(
+        "--to-file",
+        type=Path,
+        help="Optional local file path or directory to save the returned transcript artifact.",
+    )
+    return parser
+
+
+def resolve_server(args: argparse.Namespace) -> str:
+    """Return the backend base URL from ``--server`` or ``WHISPER_REMOTE``."""
+    server = args.server or os.environ.get("WHISPER_REMOTE")
+    if not server:
+        raise SystemExit("WHISPER_REMOTE is not set and --server was not provided.")
+    return server.rstrip("/")
+
+
+def infer_output_path(target: Path, input_file: Path, output_format: str) -> Path:
+    """Resolve ``--to-file`` to a concrete file path."""
+    if target.exists() and target.is_dir():
+        return target / f"{input_file.stem}.{output_format}"
+    if target.suffix:
+        return target
+    # A non-existent target without a suffix is treated as a directory.
+    return target / f"{input_file.stem}.{output_format}"
+
+
+def print_response(response: httpx.Response) -> None:
+    """Write the response text to stdout, appending a newline when non-empty text lacks one."""
+    sys.stdout.write(response.text)
+    if response.text and not response.text.endswith("\n"):
+        sys.stdout.write("\n")
+
+
+def save_response(response: httpx.Response, destination: Path) -> None:
+    """Write the raw response body to ``destination``, creating parent directories."""
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_bytes(response.content)
+
+
+def main() -> int:
+    """Upload the input file to the backend and print or save the transcript."""
+    parser = build_parser()
+    args = parser.parse_args()
+
+    input_file = args.file.expanduser().resolve()
+    if not input_file.is_file():
+        parser.error(f"Input file does not exist: {input_file}")
+
+    server = resolve_server(args)
+    endpoint = f"{server}/transcriptions"
+
+    data = {"model": args.model, "output_format": args.output_format}
+    if args.language:
+        # Omit the field entirely rather than sending an empty language code.
+        data["language"] = args.language
+
+    try:
+        with input_file.open("rb") as handle, httpx.Client(timeout=300.0) as client:
+            response = client.post(
+                endpoint,
+                data=data,
+                files={"file": (input_file.name, handle, "application/octet-stream")},
+            )
+    except httpx.RequestError as exc:
+        # Connection failures and timeouts get a clean message, not a traceback.
+        parser.exit(1, f"Request to {endpoint} failed: {exc}\n")
+
+    try:
+        response.raise_for_status()
+    except httpx.HTTPStatusError as exc:
+        message = exc.response.text.strip() or str(exc)
+        parser.exit(1, f"{message}\n")
+
+    if args.to_file:
+        destination = infer_output_path(args.to_file.expanduser(), input_file, args.output_format)
+        save_response(response, destination)
+        sys.stdout.write(f"{destination}\n")
+    else:
+        print_response(response)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/cli/tests/conftest.py b/cli/tests/conftest.py
new file mode 100644
index 0000000..df150dc
--- /dev/null
+++ b/cli/tests/conftest.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SRC = ROOT / "src"
+
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
diff --git a/cli/tests/test_main.py b/cli/tests/test_main.py
new file mode 100644
index 0000000..180642a
--- /dev/null
+++ b/cli/tests/test_main.py
@@ -0,0 +1,28 @@
+import os
+from argparse import Namespace
+from pathlib import Path
+
+import pytest
+
+from whisper_remote_cli import main
+
+
+def test_resolve_server_from_env(monkeypatch) -> None:
+    monkeypatch.setenv("WHISPER_REMOTE", "http://localhost:8000/")
+    assert main.resolve_server(Namespace(server=None)) == "http://localhost:8000"
+
+
+def test_resolve_server_requires_value(monkeypatch) -> None:
+    monkeypatch.delenv("WHISPER_REMOTE", raising=False)
+    with pytest.raises(SystemExit):
+        main.resolve_server(Namespace(server=None))
+
+
+def test_infer_output_path_for_directory(tmp_path: Path) -> None:
+    destination = main.infer_output_path(tmp_path, Path("clip.wav"), "srt")
+    assert destination == tmp_path / "clip.srt"
+
+
+def test_infer_output_path_for_explicit_file(tmp_path: Path) -> None:
+    destination = main.infer_output_path(tmp_path / "custom-name.txt", Path("clip.wav"), "txt")
+    assert destination == tmp_path / "custom-name.txt"