From 26cb7786255bc1fb595746827b4137bbf26be18e Mon Sep 17 00:00:00 2001 From: Leonardo Diegues Date: Sat, 17 Feb 2024 17:34:10 -0300 Subject: [PATCH 1/4] Add skip for short audio chunks in OpenAIWhisperParser --- .../langchain_community/document_loaders/parsers/audio.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py index 65674e3d1dacd..c8c2d8b57eeda 100644 --- a/libs/community/langchain_community/document_loaders/parsers/audio.py +++ b/libs/community/langchain_community/document_loaders/parsers/audio.py @@ -52,11 +52,15 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # Need to meet 25MB size limit for Whisper API chunk_duration = 20 chunk_duration_ms = chunk_duration * 60 * 1000 + chunk_duration_threshold = 0.1 # Split the audio into chunk_duration_ms chunks for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)): # Audio chunk chunk = audio[i : i + chunk_duration_ms] + # Skip chunks that are too short to transcribe + if chunk.duration_seconds <= chunk_duration_threshold: + continue file_obj = io.BytesIO(chunk.export(format="mp3").read()) if blob.source is not None: file_obj.name = blob.source + f"_part_{split_number}.mp3" From 244359285565a3ad12c54fbeaa08a6bcd544c546 Mon Sep 17 00:00:00 2001 From: Leonardo Diegues Date: Tue, 20 Feb 2024 06:19:32 -0300 Subject: [PATCH 2/4] Refactor OpenAIWhisperParser constructor to include chunk_duration_threshold parameter --- .../document_loaders/parsers/audio.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py index c8c2d8b57eeda..85c8bc7b1b7d8 100644 --- a/libs/community/langchain_community/document_loaders/parsers/audio.py +++ 
b/libs/community/langchain_community/document_loaders/parsers/audio.py @@ -13,10 +13,21 @@ class OpenAIWhisperParser(BaseBlobParser): """Transcribe and parse audio files. - Audio transcription is with OpenAI Whisper model.""" + Audio transcription is with OpenAI Whisper model. + + Parameters: + api_key - OpenAI API key + chunk_duration_threshold - minimum duration of a chunk in seconds + NOTE: According to the OpenAI API, the chunk duration should be at least 0.1 + seconds. If the chunk duration is less or equal than the threshold, + it will be skipped. + """ - def __init__(self, api_key: Optional[str] = None): + def __init__( + self, api_key: Optional[str] = None, chunk_duration_threshold: float = 0.1 + ): self.api_key = api_key + self.chunk_duration_threshold = chunk_duration_threshold def lazy_parse(self, blob: Blob) -> Iterator[Document]: """Lazily parse the blob.""" @@ -52,14 +63,13 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # Need to meet 25MB size limit for Whisper API chunk_duration = 20 chunk_duration_ms = chunk_duration * 60 * 1000 - chunk_duration_threshold = 0.1 # Split the audio into chunk_duration_ms chunks for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)): # Audio chunk chunk = audio[i : i + chunk_duration_ms] # Skip chunks that are too short to transcribe - if chunk.duration_seconds <= chunk_duration_threshold: + if chunk.duration_seconds <= self.chunk_duration_threshold: continue file_obj = io.BytesIO(chunk.export(format="mp3").read()) if blob.source is not None: From 15be07f99f93e19aa22a64996d392d896bd715b7 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 20 Feb 2024 11:20:06 -0800 Subject: [PATCH 3/4] fmt --- .../langchain_community/document_loaders/parsers/audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py index 85c8bc7b1b7d8..431ddbc7dfc06 
100644 --- a/libs/community/langchain_community/document_loaders/parsers/audio.py +++ b/libs/community/langchain_community/document_loaders/parsers/audio.py @@ -24,7 +24,7 @@ class OpenAIWhisperParser(BaseBlobParser): """ def __init__( - self, api_key: Optional[str] = None, chunk_duration_threshold: float = 0.1 + self, api_key: Optional[str] = None, *, chunk_duration_threshold: float = 0.1 ): self.api_key = api_key self.chunk_duration_threshold = chunk_duration_threshold From a6c2d1060d5f8e41b2aef2b1b158bc06d43dc3cc Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 20 Feb 2024 11:21:25 -0800 Subject: [PATCH 4/4] fmt --- .../document_loaders/parsers/audio.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py index 431ddbc7dfc06..77d1b2b8e121a 100644 --- a/libs/community/langchain_community/document_loaders/parsers/audio.py +++ b/libs/community/langchain_community/document_loaders/parsers/audio.py @@ -13,14 +13,15 @@ class OpenAIWhisperParser(BaseBlobParser): """Transcribe and parse audio files. + Audio transcription is with OpenAI Whisper model. - Parameters: - api_key - OpenAI API key - chunk_duration_threshold - minimum duration of a chunk in seconds - NOTE: According to the OpenAI API, the chunk duration should be at least 0.1 - seconds. If the chunk duration is less or equal than the threshold, - it will be skipped. + Args: + api_key: OpenAI API key + chunk_duration_threshold: minimum duration of a chunk in seconds + NOTE: According to the OpenAI API, the chunk duration should be at least 0.1 + seconds. If the chunk duration is less than or equal to the threshold, + it will be skipped. """ def __init__(