langchain-ai · baskaryan · Feb 23, 2024 · Feb 17, 2024 · Feb 20, 2024 · Feb 20, 2024
diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py
@@ -13,10 +13,21 @@
 
 class OpenAIWhisperParser(BaseBlobParser):
     """Transcribe and parse audio files.
-    Audio transcription is with OpenAI Whisper model."""
+    Audio transcription is done with the OpenAI Whisper model.
+
+    Parameters:
+    api_key - OpenAI API key
+    chunk_duration_threshold - minimum duration of a chunk in seconds
+        NOTE: According to the OpenAI API, the chunk duration should be at least 0.1
+        seconds. If the chunk duration is less than or equal to the threshold,
+        it will be skipped.
+    """
 
-    def __init__(self, api_key: Optional[str] = None):
+    def __init__(
+        self, api_key: Optional[str] = None, chunk_duration_threshold: float = 0.1
+    ):
         self.api_key = api_key
+        self.chunk_duration_threshold = chunk_duration_threshold
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
@@ -57,6 +68,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
             # Audio chunk
             chunk = audio[i : i + chunk_duration_ms]
+            # Skip chunks that are too short to transcribe
+            if chunk.duration_seconds <= self.chunk_duration_threshold:
+                continue
             file_obj = io.BytesIO(chunk.export(format="mp3").read())
             if blob.source is not None:
                 file_obj.name = blob.source + f"_part_{split_number}.mp3"