Skip to content

Commit

Permalink
feat(parser, audio): added audio-parser RemoteFasterWhisperPerser in …
Browse files Browse the repository at this point in the history
…audio.py
  • Loading branch information
lfenzo committed Sep 19, 2024
1 parent 4e0a6eb commit 581115a
Showing 1 changed file with 131 additions and 3 deletions.
134 changes: 131 additions & 3 deletions libs/community/langchain_community/document_loaders/parsers/audio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import io
import json
import logging
import os
import subprocess
import time
from typing import Any, Dict, Iterator, Literal, Optional, Tuple, Union

Expand Down Expand Up @@ -435,9 +438,6 @@ def __init__(

def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""

import io

try:
from pydub import AudioSegment
except ImportError:
Expand Down Expand Up @@ -483,3 +483,131 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
**blob.metadata,
},
)


class RemoteFasterWhisperParser(BaseBlobParser):
"""
Remotely transcribe audio with faster-whisper-server
This parser enables audio transcription by interacting with a remote
`faster-whisper-server`, an OpenAI API-compatible transcription server that uses
`faster-whisper` as its backend. By sending audio data to the remote server, this
parser eliminates the need for GPU dependencies and heavy processing within the
LangChain container, making it ideal for microservice-based architectures.
For more information visit: https://github.com/fedirz/faster-whisper-server
Example: Load a local audio file and remotely transcribe it into a document.
.. code-block:: python
from langchain.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import RemoteFasterWhisperParser
server_url = 'http://localhost:8000' # add you server url here
loader = GenericLoader.from_filesystem(
path='./my-audio-file.mp3',
parser=RemoteFasterWhisperParser(base_url=server_url),
)
print(loader.load())
"""
TRANSCRIPTION_ENDPOINT = "v1/audio/transcriptions"

def __init__(
self,
*,
base_url: str,
model_size: Optional[str] = 'Systran/faster-distil-whisper-large-v3',
) -> None:
"""
Initialize the RemoteFasterWhisperParser.
Args:
base_url (str): The base URL of the remote Faster Whisper server.
model_size (Optional[str]): The model size to use for transcription.
Defaults to 'Systran/faster-distil-whisper-large-v3'.
"""
self.base_url = base_url
self.model_size = model_size

def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""
Lazily parse the audio blob and yield transcribed text.
Args:
blob (Blob): The audio blob to be transcribed.
Yields:
Document: A Document object containing the transcribed text and associated metadata.
"""
audio_bytes = self._load_audio_from_blob(blob=blob)
transcription = self._transcribe_audio(file_bytes=audio_bytes)
yield Document(
page_content=transcription['text'],
metadata={
"source": blob.source,
**blob.metadata,
},
)

def _transcribe_audio(self, file_bytes: bytes) -> dict[str, str]:
"""
Transcribe the audio file by sending it to the remote Faster Whisper server.
Args:
file_bytes (bytes): The audio file data in bytes format.
Returns:
dict[str, str]: A dictionary containing the transcription result.
Raises:
RuntimeError: If the transcription process fails.
"""
url = self._get_transcription_url()
process = subprocess.Popen(
['curl', url, '-F', 'file=@-;type=audio/mp3', '-F', f'model={self.model_size}'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)

stdout, stderr = process.communicate(input=file_bytes.read())

if process.returncode != 0:
raise RuntimeError(f'Error: {stderr.decode()}')

return json.loads(stdout.decode())

def _get_transcription_url(self) -> str:
return f"{self.base_url}/{self.TRANSCRIPTION_ENDPOINT}"

def _load_audio_from_blob(self, blob: Blob) -> bytes:
"""
Load the audio data from the given blob and convert it to MP3 format.
Args:
blob (Blob): The audio blob containing the data to be converted.
Returns:
bytes: The audio data in MP3 format.
Raises:
ImportError: If the `pydub` package is not installed.
ValueError: If the audio data cannot be extracted from the blob.
"""
try:
from pydub import AudioSegment
except ImportError:
raise ImportError(
"pydub package not found, please install it with `pip install pydub`"
)

if isinstance(blob.data, bytes):
audio = AudioSegment.from_file(io.BytesIO(blob.data))
elif blob.data is None and blob.path:
audio = AudioSegment.from_file(blob.path)
else:
raise ValueError("Unable to get audio from blob")

return io.BytesIO(audio.export(format="mp3").read())

0 comments on commit 581115a

Please sign in to comment.