From 4b9d55e08911a0527974ec6b602cbaef2bd41363 Mon Sep 17 00:00:00 2001
From: Fedir Zadniprovskyi
Date: Tue, 7 Jan 2025 19:37:45 -0800
Subject: [PATCH] docs: init

---
 .pre-commit-config.yaml             |   2 +-
 docs/configuration.md               |  22 ++++++
 docs/index.md                       |   1 -
 docs/installation.md                | 101 ++++++++++++++++++++++++++++
 docs/introduction.md                |  32 +++++++++
 docs/usage.md                       |  86 +++++++++++++++++++++++
 mkdocs.yml                          |   5 +-
 src/faster_whisper_server/config.py |  31 +++++++--
 8 files changed, 270 insertions(+), 10 deletions(-)
 create mode 100644 docs/configuration.md
 delete mode 100644 docs/index.md
 create mode 100644 docs/installation.md
 create mode 100644 docs/introduction.md
 create mode 100644 docs/usage.md

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 95d4bae..b84d930 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -44,4 +44,4 @@ repos:
     rev: v1.5.0
     hooks:
       - id: detect-secrets
-        exclude: 'README.md|tests/conftest.py'
+        exclude: 'README.md|tests/conftest.py|docs/usage.md'
diff --git a/docs/configuration.md b/docs/configuration.md
new file mode 100644
index 0000000..7a13851
--- /dev/null
+++ b/docs/configuration.md
@@ -0,0 +1,22 @@
+
+::: faster_whisper_server.config.Config
+    options:
+      show_bases: true
+      show_if_no_docstring: true
+      show_labels: false
+      separate_signature: true
+      show_signature_annotations: true
+      signature_crossrefs: true
+      summary: false
+      source: true
+      members_order: source
+      filters:
+        - "!model_config"
+        - "!chat_completion_*"
+        - "!speech_*"
+        - "!transcription_*"
+
+::: faster_whisper_server.config.WhisperConfig
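+
+All of these values can be set through environment variables. A sketch of the mapping — the `WHISPER__*` name assumes `Config` exposes the `WhisperConfig` documented above as a field named `whisper`; nested fields use the `env_nested_delimiter="__"` configured on `Config`:
+
+```bash
+# Top-level fields map to upper-cased variable names.
+export DEFAULT_LANGUAGE=en
+# Nested fields are joined with `__`, e.g. WhisperConfig.model:
+export WHISPER__MODEL=Systran/faster-distil-whisper-large-v3
+```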
diff --git a/docs/index.md b/docs/index.md
deleted file mode 100644
index 684e27f..0000000
--- a/docs/index.md
+++ /dev/null
@@ -1 +0,0 @@
-Coming soon...
diff --git a/docs/installation.md b/docs/installation.md
new file mode 100644
index 0000000..3af4ec2
--- /dev/null
+++ b/docs/installation.md
@@ -0,0 +1,101 @@
+## Docker Compose (Recommended)
+
+TODO: just reference the existing compose file in the repo
+
+=== "CUDA"
+
+    ```yaml
+    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+    services:
+      faster-whisper-server:
+        image: fedirz/faster-whisper-server:latest-cuda
+        container_name: faster-whisper-server
+        restart: unless-stopped
+        ports:
+          - 8000:8000
+        volumes:
+          - hugging_face_cache:/root/.cache/huggingface
+        deploy:
+          resources:
+            reservations:
+              devices:
+                - capabilities: ["gpu"]
+    volumes:
+      hugging_face_cache:
+    ```
+
+=== "CUDA (with CDI feature enabled)"
+
+    ```yaml
+    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
+    services:
+      faster-whisper-server:
+        image: fedirz/faster-whisper-server:latest-cuda
+        container_name: faster-whisper-server
+        restart: unless-stopped
+        ports:
+          - 8000:8000
+        volumes:
+          - hugging_face_cache:/root/.cache/huggingface
+        deploy:
+          resources:
+            reservations:
+              # https://docs.docker.com/reference/cli/dockerd/#enable-cdi-devices
+              # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html
+              devices:
+                - driver: cdi
+                  device_ids:
+                    - nvidia.com/gpu=all
+    volumes:
+      hugging_face_cache:
+    ```
+
+=== "CPU"
+
+    ```yaml
+    services:
+      faster-whisper-server:
+        image: fedirz/faster-whisper-server:latest-cpu
+        container_name: faster-whisper-server
+        restart: unless-stopped
+        ports:
+          - 8000:8000
+        volumes:
+          - hugging_face_cache:/root/.cache/huggingface
+    volumes:
+      hugging_face_cache:
+    ```
+
+## Docker
+
+=== "CUDA"
+
+    ```bash
+    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hugging_face_cache:/root/.cache/huggingface --gpus=all fedirz/faster-whisper-server:latest-cuda
+    ```
+
+=== "CUDA (with CDI feature enabled)"
+
+    ```bash
+    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hugging_face_cache:/root/.cache/huggingface --device=nvidia.com/gpu=all fedirz/faster-whisper-server:latest-cuda
+    ```
+
+=== "CPU"
+
+    ```bash
+    docker run --rm --detach --publish 8000:8000 --name faster-whisper-server --volume hugging_face_cache:/root/.cache/huggingface fedirz/faster-whisper-server:latest-cpu
+    ```
+
+## Kubernetes
+
+WARNING: the guide below was written a few months ago and may be outdated.
+Please refer to this [blog post](https://substratus.ai/blog/deploying-faster-whisper-on-k8s).
+
+## Python (requires Python 3.12+)
+
+```bash
+git clone https://github.com/fedirz/faster-whisper-server.git
+cd faster-whisper-server
+uv venv
+source .venv/bin/activate
+uv sync --all-extras
+uvicorn --factory --host 0.0.0.0 faster_whisper_server.main:create_app
+```
diff --git a/docs/introduction.md b/docs/introduction.md
new file mode 100644
index 0000000..fa1217e
--- /dev/null
+++ b/docs/introduction.md
@@ -0,0 +1,32 @@
+!!! warning
+
+    Under development. I don't recommend relying on these docs as a reference just yet.
+
+# Faster Whisper Server
+
+`faster-whisper-server` is an OpenAI API-compatible transcription server that uses [faster-whisper](https://github.com/SYSTRAN/faster-whisper) as its backend.
+
+Features:
+
+- GPU and CPU support.
+- Easily deployable using Docker.
+- **Configurable through environment variables (see [config.py](./src/faster_whisper_server/config.py))**.
+- OpenAI API compatible.
+- Streaming support (transcription is sent via [SSE](https://en.wikipedia.org/wiki/Server-sent_events) as the audio is transcribed; you don't need to wait for the audio to be fully transcribed before receiving results).
+- Live transcription support (audio is sent via WebSocket as it's generated).
+- Dynamic model loading/offloading: specify the model you want in the request and it will be loaded automatically, then unloaded after a period of inactivity.
+
+Please create an issue if you find a bug, have a question, or have a feature suggestion.
+
+## OpenAI API Compatibility ++
+
+See the [OpenAI API reference](https://platform.openai.com/docs/api-reference/audio) for more information.
+
+- Audio file transcription via the `POST /v1/audio/transcriptions` endpoint.
+    - Unlike OpenAI's API, `faster-whisper-server` also supports streaming transcriptions (and translations). This is useful when you're processing large audio files and would rather receive the transcription in chunks as they are produced, instead of waiting for the whole file to be transcribed. It works similarly to streamed chat completions from LLMs.
+- Audio file translation via the `POST /v1/audio/translations` endpoint.
+- Live audio transcription via the `WS /v1/audio/transcriptions` endpoint (a client sketch follows this list).
+    - The LocalAgreement2 algorithm ([paper](https://aclanthology.org/2023.ijcnlp-demo.3.pdf) | [original implementation](https://github.com/ufal/whisper_streaming)) is used for live transcription.
+    - Only single-channel, 16000 Hz sample rate, raw, 16-bit little-endian audio is supported.
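+
+Below is a minimal live-transcription client sketch. Assumptions: the `websockets` package is installed (`pip install websockets`), `audio.pcm` contains raw 16-bit little-endian mono PCM at 16000 Hz, and the server replies with text frames; the exact reply format isn't specified here, so treat this as illustrative rather than official.
+
+```python
+from websockets.sync.client import connect
+
+CHUNK = 16000 * 2  # one second of 16-bit mono audio at 16 kHz
+
+with connect('ws://localhost:8000/v1/audio/transcriptions') as ws, open('audio.pcm', 'rb') as f:
+    while chunk := f.read(CHUNK):
+        ws.send(chunk)    # stream the audio as it "arrives"
+        print(ws.recv())  # print the latest transcription update
+```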
+
+TODO: add a note about gradio ui
+TODO: add a note about hf space
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 0000000..641eec4
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,86 @@
+TODO: break this down into: transcription/translation, streaming transcription/translation, live transcription, audio generation, model listing
+TODO: add video demos for all
+TODO: add a note about OPENAI_API_KEY
+
+## Curl
+
+```bash
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
+```
+
+## Python
+
+=== "httpx"
+
+    ```python
+    import httpx
+
+    with open('audio.wav', 'rb') as f:
+        files = {'file': ('audio.wav', f)}
+        response = httpx.post('http://localhost:8000/v1/audio/transcriptions', files=files)
+
+    print(response.text)
+    ```
+
+## OpenAI SDKs
+
+=== "Python"
+
+    ```python
+    from openai import OpenAI
+
+    client = OpenAI(base_url='http://localhost:8000/v1', api_key='cant-be-empty')
+
+    with open('audio.wav', 'rb') as f:
+        transcript = client.audio.transcriptions.create(model='Systran/faster-whisper-small', file=f)
+
+    print(transcript.text)
+    ```
+
+=== "CLI"
+
+    ```bash
+    export OPENAI_BASE_URL=http://localhost:8000/v1/
+    export OPENAI_API_KEY="cant-be-empty"
+    openai api audio.transcriptions.create -m Systran/faster-whisper-small -f audio.wav --response-format text
+    ```
+
+=== "Other"
+
+    See [OpenAI libraries](https://platform.openai.com/docs/libraries) and [OpenAI speech-to-text usage](https://platform.openai.com/docs/guides/speech-to-text).
+
+## Open WebUI
+
+### Using the UI
+
+1. Go to the [Admin Settings](http://localhost:8080/admin/settings) page.
+2. Click on the "Audio" tab.
+3. Update the settings:
+    - Speech-to-Text Engine: OpenAI
+    - API Base URL: http://faster-whisper-server:8000/v1
+    - API Key: does-not-matter-what-you-put-but-should-not-be-empty
+    - Model: Systran/faster-distil-whisper-large-v3
+4. Click "Save".
+
+### Using environment variables (Docker Compose)
+
+!!! warning
+
+    This doesn't seem to work when you've previously used the UI to set the STT engine.
+
+```yaml
+# NOTE: Some parts of the file are omitted for brevity.
+services:
+  open-webui:
+    image: ghcr.io/open-webui/open-webui:main
+    ...
+    environment:
+      ...
+      # Environment variables are documented at https://docs.openwebui.com/getting-started/env-configuration#speech-to-text
+      AUDIO_STT_ENGINE: "openai"
+      AUDIO_STT_OPENAI_API_BASE_URL: "http://faster-whisper-server:8000/v1"
+      AUDIO_STT_OPENAI_API_KEY: "does-not-matter-what-you-put-but-should-not-be-empty"
+      AUDIO_STT_MODEL: "Systran/faster-distil-whisper-large-v3"
+  faster-whisper-server:
+    image: fedirz/faster-whisper-server:latest-cuda
+    ...
+```
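+
+## Streaming
+
+Transcription can also be streamed back via SSE, as described in the introduction. Here is a sketch using `httpx`; the `stream` form field and the shape of each event are assumptions, so check the server's API for the exact contract:
+
+```python
+import httpx
+
+with open('audio.wav', 'rb') as f:
+    files = {'file': ('audio.wav', f)}
+    # Passing `stream=true` as a form field is an assumption.
+    with httpx.stream('POST', 'http://localhost:8000/v1/audio/transcriptions', files=files, data={'stream': 'true'}) as response:
+        for line in response.iter_lines():
+            if line:  # skip blank SSE separator lines
+                print(line)
+```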
diff --git a/mkdocs.yml b/mkdocs.yml
index dff45cd..9a8bf6a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -20,7 +20,10 @@ plugins:
   - mkdocstrings:
       default_handler: python
 nav:
-  - Home: index.md
+  - Introduction: introduction.md
+  - Installation: installation.md
+  - Configuration: configuration.md
+  - Usage: usage.md
 markdown_extensions:
   - admonition
   - pymdownx.superfences
diff --git a/src/faster_whisper_server/config.py b/src/faster_whisper_server/config.py
index 9e462d5..c06a164 100644
--- a/src/faster_whisper_server/config.py
+++ b/src/faster_whisper_server/config.py
@@ -38,6 +38,7 @@ class Quantization(enum.StrEnum):
     DEFAULT = "default"
 
 
+# TODO: this needs to be rethought
 class Language(enum.StrEnum):
     AF = "af"
     AM = "am"
@@ -151,7 +152,7 @@ class WhisperConfig(BaseModel):
     model: str = Field(default="Systran/faster-whisper-small")
     """
-    Default Huggingface model to use for transcription. Note, the model must support being ran using CTranslate2.
+    Default HuggingFace model to use for transcription. Note that the model must support being run using CTranslate2.
     This model will be used if no model is specified in the request.
 
     Models created by authors of `faster-whisper` can be found at https://huggingface.co/Systran
@@ -174,6 +175,7 @@ class WhisperConfig(BaseModel):
     """  # noqa: E501
 
 
+# TODO: document `alias` behaviour within the docstring
 class Config(BaseSettings):
     """Configuration for the application. Values can be set via environment variables.
 
@@ -185,7 +187,13 @@ class Config(BaseSettings):
     model_config = SettingsConfigDict(env_nested_delimiter="__")
 
     api_key: str | None = None
+    """
+    If set, the API key will be required for all requests.
+    """
     log_level: str = "debug"
+    """
+    Logging level. One of: 'debug', 'info', 'warning', 'error', 'critical'.
+    """
     host: str = Field(alias="UVICORN_HOST", default="0.0.0.0")
     port: int = Field(alias="UVICORN_PORT", default=8000)
     allow_origins: list[str] | None = None
@@ -198,8 +206,8 @@ class Config(BaseSettings):
 
     enable_ui: bool = True
     """
-    Whether to enable the Gradio UI. You may want to disable this if you want to minimize the dependencies.
-    """
+    Whether to enable the Gradio UI. You may want to disable this if you want to minimize the dependencies and slightly improve the startup time.
+    """  # noqa: E501
 
     default_language: Language | None = None
     """
@@ -216,26 +224,35 @@ class Config(BaseSettings):
         ],
     )
     """
-    List of models to preload on startup. By default, the model is first loaded on first request.
+    List of Whisper models to preload on startup. By default, the model is first loaded on first request.
+    WARNING: I'd recommend not setting this, as it may be deprecated in the future.
     """
 
     max_no_data_seconds: float = 1.0
     """
     Max duration to wait for the next audio chunk before transcription is finilized and connection is closed.
+    Used only for live transcription (WS /v1/audio/transcriptions).
     """
 
     min_duration: float = 1.0
     """
     Minimum duration of an audio chunk that will be transcribed.
+    Used only for live transcription (WS /v1/audio/transcriptions).
     """
 
     word_timestamp_error_margin: float = 0.2
+    """
+    Used only for live transcription (WS /v1/audio/transcriptions).
+    """
 
     max_inactivity_seconds: float = 2.5
     """
     Max allowed audio duration without any speech being detected before transcription is finilized and connection is closed.
+    Used only for live transcription (WS /v1/audio/transcriptions).
     """  # noqa: E501
 
     inactivity_window_seconds: float = 5.0
     """
-    Controls how many latest seconds of audio are being passed through VAD.
-    Should be greater than `max_inactivity_seconds`
-    """
+    Controls how many latest seconds of audio are being passed through VAD. Should be greater than `max_inactivity_seconds`.
+    Used only for live transcription (WS /v1/audio/transcriptions).
+    """  # noqa: E501
+
+    # NOTE: options below are not used yet and should be ignored. Added as a placeholder for future features I'm currently working on.  # noqa: E501
 
     chat_completion_base_url: str = "https://api.openai.com/v1"
     chat_completion_api_key: str | None = None
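
The live-transcription fields above are ordinary `Config` settings, so they can be tuned through the environment like everything else. A sketch — the variable names assume pydantic-settings' default field-name mapping, and the values are illustrative:

```bash
# Wait longer for the next audio chunk before a live transcription is finalized.
export MAX_NO_DATA_SECONDS=2.0
# Allow longer speech-free stretches before the connection is closed.
export MAX_INACTIVITY_SECONDS=5.0
# Must stay greater than MAX_INACTIVITY_SECONDS.
export INACTIVITY_WINDOW_SECONDS=10.0
```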