From 26cb7786255bc1fb595746827b4137bbf26be18e Mon Sep 17 00:00:00 2001
From: Leonardo Diegues
Date: Sat, 17 Feb 2024 17:34:10 -0300
Subject: [PATCH] Add skip for short audio chunks in OpenAIWhisperParser

---
 .../langchain_community/document_loaders/parsers/audio.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py
index 65674e3d1dacd..c8c2d8b57eeda 100644
--- a/libs/community/langchain_community/document_loaders/parsers/audio.py
+++ b/libs/community/langchain_community/document_loaders/parsers/audio.py
@@ -52,11 +52,15 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         # Need to meet 25MB size limit for Whisper API
         chunk_duration = 20
         chunk_duration_ms = chunk_duration * 60 * 1000
+        chunk_duration_threshold = 0.1
 
         # Split the audio into chunk_duration_ms chunks
         for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
             # Audio chunk
             chunk = audio[i : i + chunk_duration_ms]
+            # Skip chunks that are too short to transcribe
+            if chunk.duration_seconds <= chunk_duration_threshold:
+                continue
             file_obj = io.BytesIO(chunk.export(format="mp3").read())
             if blob.source is not None:
                 file_obj.name = blob.source + f"_part_{split_number}.mp3"