Add whisper remote backend and CLI
This commit is contained in:
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
__pycache__/
|
||||||
|
.pytest_cache/
|
||||||
|
*.egg-info/
|
||||||
23
backend/README.md
Normal file
23
backend/README.md
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# whisper-remote-backend
|
||||||
|
|
||||||
|
FastAPI wrapper around the upstream `whisper` CLI from `openai/whisper`.
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -e .
|
||||||
|
uvicorn whisper_remote_backend.server:app --host 0.0.0.0 --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
`POST /transcriptions`
|
||||||
|
|
||||||
|
Multipart form fields:
|
||||||
|
|
||||||
|
- `file`: media file
|
||||||
|
- `model`: Whisper model name
|
||||||
|
- `language`: optional language code
|
||||||
|
- `output_format`: `txt`, `vtt`, `srt`, `tsv`, or `json`
|
||||||
|
|
||||||
|
The response body is the transcript artifact itself. The backend deletes the uploaded file and generated output after each request.
|
||||||
30
backend/pyproject.toml
Normal file
30
backend/pyproject.toml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "whisper-remote-backend"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "FastAPI wrapper around the openai/whisper CLI"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"fastapi>=0.115.0,<1.0.0",
|
||||||
|
"python-multipart>=0.0.9,<1.0.0",
|
||||||
|
"uvicorn>=0.32.0,<1.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
whisper-remote-server = "whisper_remote_backend.server:main"
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"httpx>=0.28.0,<1.0.0",
|
||||||
|
"pytest>=8.3.0,<9.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
package-dir = {"" = "src"}
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
1
backend/src/whisper_remote_backend/__init__.py
Normal file
1
backend/src/whisper_remote_backend/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""whisper-remote backend package."""
|
||||||
144
backend/src/whisper_remote_backend/server.py
Normal file
144
backend/src/whisper_remote_backend/server.py
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from tempfile import TemporaryDirectory
|
||||||
|
|
||||||
|
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||||
|
from fastapi.responses import Response
|
||||||
|
|
||||||
|
# Response media type for each transcript format the backend can return.
CONTENT_TYPES = dict(
    txt="text/plain; charset=utf-8",
    vtt="text/vtt; charset=utf-8",
    srt="application/x-subrip; charset=utf-8",
    tsv="text/tab-separated-values; charset=utf-8",
    json="application/json; charset=utf-8",
)

# Output formats accepted by the API: exactly the formats that have a media type.
SUPPORTED_FORMATS = set(CONTENT_TYPES)
|
||||||
|
|
||||||
|
app = FastAPI(title="whisper-remote-backend")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_output_format(output_format: str) -> str:
    """Return ``output_format`` trimmed and lower-cased, or reject it.

    Raises:
        HTTPException: status 400 when the normalized value is not one of
            ``SUPPORTED_FORMATS``. The message echoes the value as received.
    """
    candidate = output_format.strip().lower()
    if candidate in SUPPORTED_FORMATS:
        return candidate

    allowed = ", ".join(sorted(SUPPORTED_FORMATS))
    message = f"Unsupported output format '{output_format}'. Supported formats: {allowed}."
    raise HTTPException(status_code=400, detail=message)
|
||||||
|
|
||||||
|
|
||||||
|
def build_whisper_command(
    *,
    input_path: Path,
    output_dir: Path,
    model: str,
    language: str | None,
    output_format: str,
) -> list[str]:
    """Assemble the argv list used to invoke the ``whisper`` CLI.

    The ``--language`` option is appended only when ``language`` is truthy, so
    both ``None`` and an empty string leave the option out of the command.
    """
    language_args = ["--language", language] if language else []
    return [
        "whisper",
        str(input_path),
        "--model",
        model,
        "--output_format",
        output_format,
        "--output_dir",
        str(output_dir),
        *language_args,
    ]
|
||||||
|
|
||||||
|
|
||||||
|
async def save_upload(upload: UploadFile, destination: Path) -> None:
    """Stream ``upload`` to ``destination`` in 1 MiB chunks, then close the upload.

    The upload is closed in a ``finally`` clause so it is released even when
    reading from the client or writing to disk raises; the original exception
    still propagates to the caller.
    """
    chunk_size = 1024 * 1024
    try:
        with destination.open("wb") as handle:
            # An empty read marks the end of the upload stream.
            while chunk := await upload.read(chunk_size):
                handle.write(chunk)
    finally:
        await upload.close()
|
||||||
|
|
||||||
|
|
||||||
|
def find_transcript_file(output_dir: Path, input_name: str, output_format: str) -> Path:
    """Locate the transcript Whisper wrote into ``output_dir``.

    The preferred candidate is ``<stem of input_name>.<output_format>``. If
    that file is absent, the directory is searched for files with the
    requested extension and the match is accepted only when it is unique.

    Raises:
        HTTPException: status 500 when no unambiguous transcript is found.
    """
    stem = Path(input_name).stem
    preferred = output_dir.joinpath(f"{stem}.{output_format}")
    if preferred.exists():
        return preferred

    candidates = [*output_dir.glob(f"*.{output_format}")]
    if len(candidates) != 1:
        raise HTTPException(
            status_code=500,
            detail="Whisper finished without producing the expected output file.",
        )
    return candidates[0]
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def healthcheck() -> dict[str, str]:
    """Liveness probe: always answers with ``{"status": "ok"}``."""
    payload = {"status": "ok"}
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def _content_disposition(download_name: str) -> str:
    """Build an ``attachment`` Content-Disposition value that is always a valid header.

    ``download_name`` is derived from the client-supplied filename, so it may
    contain quotes, backslashes, control characters or non-ASCII text. Those
    characters are replaced with ``_`` in the plain ``filename`` parameter; when
    anything had to be replaced, the real name is also sent percent-encoded in
    an RFC 5987 ``filename*`` parameter. Ordinary ASCII names produce exactly
    ``attachment; filename="<name>"``.
    """
    from urllib.parse import quote

    fallback = "".join(
        char if char.isascii() and char.isprintable() and char not in '"\\' else "_"
        for char in download_name
    )
    value = f'attachment; filename="{fallback}"'
    if fallback != download_name:
        value += f"; filename*=UTF-8''{quote(download_name, safe='')}"
    return value


@app.post("/transcriptions")
async def transcribe(
    file: UploadFile = File(...),
    model: str = Form(...),
    language: str | None = Form(default=None),
    output_format: str = Form(...),
) -> Response:
    """Transcribe an uploaded media file with the ``whisper`` CLI.

    The upload is stored in a temporary directory, Whisper writes its output to
    a second temporary directory, and the transcript bytes are read before both
    directories are removed on exit from the ``with`` block.

    Returns:
        A response whose body is the transcript, with the media type from
        ``CONTENT_TYPES`` and ``Content-Disposition``, ``X-Whisper-Output-Format``
        and ``X-Whisper-Model`` headers.

    Raises:
        HTTPException: 400 for an unsupported format or an unusable filename,
            500 when the ``whisper`` executable is missing or produced no
            transcript, 502 when the CLI exits with a non-zero status.
    """
    import asyncio

    normalized_format = validate_output_format(output_format)
    if not file.filename:
        raise HTTPException(status_code=400, detail="Uploaded file must have a filename.")

    # Only the last path component of the client-supplied name is used, so the
    # upload cannot be written outside the temporary directory. Names that
    # reduce to nothing or to a directory reference are rejected up front.
    safe_name = Path(file.filename).name
    if safe_name in {"", ".", ".."}:
        raise HTTPException(status_code=400, detail="Uploaded file must have a valid filename.")

    with TemporaryDirectory(prefix="whisper-remote-upload-") as upload_root, TemporaryDirectory(
        prefix="whisper-remote-output-"
    ) as output_root:
        input_path = Path(upload_root) / safe_name
        output_dir = Path(output_root)
        await save_upload(file, input_path)

        command = build_whisper_command(
            input_path=input_path,
            output_dir=output_dir,
            model=model,
            language=language,
            output_format=normalized_format,
        )

        try:
            # subprocess.run blocks until Whisper exits; running it in a worker
            # thread keeps the event loop free to serve other requests.
            completed = await asyncio.to_thread(
                subprocess.run,
                command,
                check=False,
                capture_output=True,
                text=True,
            )
        except FileNotFoundError as exc:
            raise HTTPException(
                status_code=500,
                detail="The 'whisper' CLI was not found on PATH on the backend host.",
            ) from exc

        if completed.returncode != 0:
            detail = completed.stderr.strip() or completed.stdout.strip() or "Whisper CLI failed."
            raise HTTPException(status_code=502, detail=detail)

        transcript_path = find_transcript_file(output_dir, safe_name, normalized_format)
        # Read the bytes now: the file disappears when the with-block exits.
        content = transcript_path.read_bytes()
        download_name = f"{Path(safe_name).stem}.{normalized_format}"

    return Response(
        content=content,
        media_type=CONTENT_TYPES[normalized_format],
        headers={
            "Content-Disposition": _content_disposition(download_name),
            "X-Whisper-Output-Format": normalized_format,
            "X-Whisper-Model": model,
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Console-script entry point: serve the app with uvicorn on 0.0.0.0:8000."""
    # Imported lazily so that importing this module does not load uvicorn.
    from uvicorn import run as serve

    serve("whisper_remote_backend.server:app", host="0.0.0.0", port=8000)
|
||||||
63
backend/tests/test_server.py
Normal file
63
backend/tests/test_server.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
from whisper_remote_backend import server
|
||||||
|
|
||||||
|
|
||||||
|
client = TestClient(server.app)
|
||||||
|
|
||||||
|
|
||||||
|
def test_validate_output_format_rejects_unknown() -> None:
    """An unsupported format must raise an error carrying HTTP status 400."""
    raised = None
    try:
        server.validate_output_format("docx")
    except Exception as exc:  # pragma: no cover - structure assertion below
        raised = exc

    if raised is None:  # pragma: no cover
        raise AssertionError("Expected HTTPException")
    assert getattr(raised, "status_code", None) == 400
|
||||||
|
|
||||||
|
|
||||||
|
def test_transcriptions_returns_generated_artifact(monkeypatch, tmp_path: Path) -> None:
    """A successful Whisper run returns the generated transcript as the body."""

    class FakeCompleted:
        returncode = 0
        stdout = ""
        stderr = ""

    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
        # Stand in for Whisper: drop a transcript into the requested output dir.
        flag_index = command.index("--output_dir")
        transcript = Path(command[flag_index + 1]) / "clip.txt"
        transcript.write_text("hello world", encoding="utf-8")
        return FakeCompleted()

    monkeypatch.setattr(server.subprocess, "run", fake_run)

    form = {"model": "base", "language": "en", "output_format": "txt"}
    upload = {"file": ("clip.wav", b"audio", "audio/wav")}
    response = client.post("/transcriptions", data=form, files=upload)

    assert response.status_code == 200
    assert response.text == "hello world"
    assert response.headers["x-whisper-output-format"] == "txt"
|
||||||
|
|
||||||
|
|
||||||
|
def test_transcriptions_maps_subprocess_failure(monkeypatch) -> None:
    """A non-zero Whisper exit becomes a 502 whose detail is the CLI's stderr."""

    class FakeCompleted:
        returncode = 1
        stdout = ""
        stderr = "bad whisper day"

    def fake_run(command: list[str], check: bool, capture_output: bool, text: bool):
        return FakeCompleted()

    monkeypatch.setattr(server.subprocess, "run", fake_run)

    form = {"model": "base", "output_format": "txt"}
    upload = {"file": ("clip.wav", b"audio", "audio/wav")}
    response = client.post("/transcriptions", data=form, files=upload)

    assert response.status_code == 502
    assert response.json()["detail"] == "bad whisper day"
|
||||||
13
cli/README.md
Normal file
13
cli/README.md
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# whisper-remote
|
||||||
|
|
||||||
|
Local CLI that forwards media files to a remote `whisper-remote-backend` server.
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -e .
|
||||||
|
export WHISPER_REMOTE=http://127.0.0.1:8000
|
||||||
|
whisper-remote ./audio.mp3 --model base --language en --output-format txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Use `--to-file` to save the returned transcript locally.
|
||||||
27
cli/pyproject.toml
Normal file
27
cli/pyproject.toml
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "whisper-remote-cli"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "CLI that forwards transcription requests to whisper-remote-backend"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"httpx>=0.28.0,<1.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.3.0,<9.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
whisper-remote = "whisper_remote_cli.main:main"
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
package-dir = {"" = "src"}
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
1
cli/src/whisper_remote_cli/__init__.py
Normal file
1
cli/src/whisper_remote_cli/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""whisper-remote CLI package."""
|
||||||
101
cli/src/whisper_remote_cli/main.py
Normal file
101
cli/src/whisper_remote_cli/main.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
SUPPORTED_FORMATS = ("txt", "vtt", "srt", "tsv", "json")
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``whisper-remote`` command.

    Defines the positional ``file`` argument plus the ``--model`` (required),
    ``--language``, ``--output-format`` (default ``txt``), ``--server`` and
    ``--to-file`` options.
    """
    parser = argparse.ArgumentParser(description="Send transcription jobs to a remote Whisper backend.")
    add = parser.add_argument

    add("file", type=Path, help="Path to the local media file to upload.")
    add("--model", required=True, help="Whisper model name to use on the backend.")
    add("--language", help="Optional language code to pass through to Whisper.")
    add(
        "--output-format",
        default="txt",
        choices=SUPPORTED_FORMATS,
        help="Transcript artifact format returned by the backend.",
    )
    add(
        "--server",
        help="Override the backend base URL. Defaults to the WHISPER_REMOTE environment variable.",
    )
    add(
        "--to-file",
        type=Path,
        help="Optional local file path or directory to save the returned transcript artifact.",
    )
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_server(args: argparse.Namespace) -> str:
    """Return the backend base URL without trailing slashes.

    ``args.server`` takes precedence; otherwise the ``WHISPER_REMOTE``
    environment variable is used.

    Raises:
        SystemExit: when neither source provides a non-empty value.
    """
    base_url = args.server if args.server else os.environ.get("WHISPER_REMOTE")
    if base_url:
        return base_url.rstrip("/")
    raise SystemExit("WHISPER_REMOTE is not set and --server was not provided.")
|
||||||
|
|
||||||
|
|
||||||
|
def infer_output_path(target: Path, input_file: Path, output_format: str) -> Path:
    """Work out where the transcript should be written for ``--to-file``.

    Resolution order:

    1. ``target`` is an existing directory: write
       ``<input stem>.<output_format>`` inside it.
    2. ``target`` is an existing regular file, or its name has a suffix:
       ``target`` itself is the destination.
    3. Otherwise ``target`` is treated as a directory that does not exist yet,
       and the default file name is placed inside it.
    """
    default_name = f"{input_file.stem}.{output_format}"
    if target.is_dir():
        return target / default_name
    # An existing file without a suffix (e.g. "TRANSCRIPT") must be overwritten
    # in place; treating it as a directory would make the later mkdir fail.
    if target.is_file() or target.suffix:
        return target
    return target / default_name
|
||||||
|
|
||||||
|
|
||||||
|
def print_response(response: httpx.Response) -> None:
    """Write the response text to stdout, ending it with a newline if it lacks one.

    An empty body produces no output at all.
    """
    body = response.text
    out = sys.stdout
    out.write(body)
    # body[-1:] is "" for an empty body, so nothing extra is written for it.
    if body[-1:] not in ("", "\n"):
        out.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def save_response(response: httpx.Response, destination: Path) -> None:
    """Write the raw response bytes to ``destination``, creating parent directories."""
    parent = destination.parent
    parent.mkdir(parents=True, exist_ok=True)
    with destination.open("wb") as sink:
        sink.write(response.content)
|
||||||
|
|
||||||
|
|
||||||
|
def _error_message(response: httpx.Response, fallback: str) -> str:
    """Extract a readable message from an error response.

    A JSON object body with a non-empty string ``detail`` field yields that
    string. Any other body is returned as stripped text, and ``fallback`` is
    used when the body is empty.
    """
    try:
        payload = response.json()
    except ValueError:
        # Body is not JSON (or not decodable); fall through to the raw text.
        payload = None
    if isinstance(payload, dict):
        detail = payload.get("detail")
        if isinstance(detail, str) and detail.strip():
            return detail.strip()
    return response.text.strip() or fallback


def main() -> int:
    """Run the ``whisper-remote`` command line interface.

    Uploads the input file to ``<server>/transcriptions`` and then either saves
    the returned transcript (``--to-file``, printing the destination path) or
    prints the transcript to stdout.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: via argparse when the input file is missing (status 2),
            and with status 1 when the backend cannot be reached or returns
            an error status; also when no server URL is configured.
    """
    parser = build_parser()
    args = parser.parse_args()

    input_file = args.file.expanduser().resolve()
    if not input_file.is_file():
        parser.error(f"Input file does not exist: {input_file}")

    server = resolve_server(args)
    endpoint = f"{server}/transcriptions"

    form = {"model": args.model, "output_format": args.output_format}
    # Send the optional field only when it was actually provided.
    if args.language:
        form["language"] = args.language

    try:
        with input_file.open("rb") as handle, httpx.Client(timeout=300.0) as client:
            response = client.post(
                endpoint,
                data=form,
                files={"file": (input_file.name, handle, "application/octet-stream")},
            )
    except httpx.RequestError as exc:
        # Connection failures and timeouts become a one-line error, not a traceback.
        parser.exit(1, f"Request to {endpoint} failed: {exc}\n")

    try:
        response.raise_for_status()
    except httpx.HTTPStatusError as exc:
        parser.exit(1, f"{_error_message(exc.response, str(exc))}\n")

    if args.to_file:
        destination = infer_output_path(args.to_file.expanduser(), input_file, args.output_format)
        save_response(response, destination)
        sys.stdout.write(f"{destination}\n")
    else:
        print_response(response)

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running the module directly; main()'s return value becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||||
11
cli/tests/conftest.py
Normal file
11
cli/tests/conftest.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# Directory one level above this file's directory.
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT.joinpath("src")

# Put the src directory first on sys.path (once) so tests can import the package from it.
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))
|
||||||
28
cli/tests/test_main.py
Normal file
28
cli/tests/test_main.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import os
|
||||||
|
from argparse import Namespace
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from whisper_remote_cli import main
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_server_from_env(monkeypatch) -> None:
    """WHISPER_REMOTE supplies the server URL, with the trailing slash stripped."""
    monkeypatch.setenv("WHISPER_REMOTE", "http://localhost:8000/")
    namespace = Namespace(server=None)
    resolved = main.resolve_server(namespace)
    assert resolved == "http://localhost:8000"
|
||||||
|
|
||||||
|
|
||||||
|
def test_resolve_server_requires_value(monkeypatch) -> None:
    """With neither --server nor WHISPER_REMOTE set, resolution exits."""
    monkeypatch.delenv("WHISPER_REMOTE", raising=False)
    namespace = Namespace(server=None)
    with pytest.raises(SystemExit):
        main.resolve_server(namespace)
|
||||||
|
|
||||||
|
|
||||||
|
def test_infer_output_path_for_directory(tmp_path: Path) -> None:
    """An existing directory target receives ``<input stem>.<format>`` inside it."""
    source = Path("clip.wav")
    expected = tmp_path / "clip.srt"
    assert main.infer_output_path(tmp_path, source, "srt") == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_infer_output_path_for_explicit_file(tmp_path: Path) -> None:
    """A target with a file suffix is used as the destination unchanged."""
    explicit = tmp_path / "custom-name.txt"
    chosen = main.infer_output_path(explicit, Path("clip.wav"), "txt")
    assert chosen == explicit
|
||||||
Reference in New Issue
Block a user