community[patch]: youtube loader transcript format (#16625)

- **Description**: `YoutubeLoader` currently returns a single document that contains the entire transcript. I think it would be useful to add an option to return multiple documents instead, where each document contains one line of the transcript, with its start time and duration in the metadata. For example, [AssemblyAIAudioTranscriptLoader](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/assemblyai.py) is implemented in a similar way: it lets you choose which format the document loader should use.
langchain-ai · Jan 26, 2024 · 4e189cd · 4e189cd
1 parent a936472
commit 4e189cd
Showing 1 changed file with 21 additions and 3 deletions.
diff --git a/libs/community/langchain_community/document_loaders/youtube.py b/libs/community/langchain_community/document_loaders/youtube.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import logging
+from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence, Union
 from urllib.parse import parse_qs, urlparse
@@ -139,6 +140,11 @@ def _parse_video_id(url: str) -> Optional[str]:
     return video_id
 
 
+class TranscriptFormat(Enum):
+    TEXT = "text"
+    LINES = "lines"
+
+
 class YoutubeLoader(BaseLoader):
     """Load `YouTube` transcripts."""
 
@@ -148,6 +154,7 @@ def __init__(
         add_video_info: bool = False,
         language: Union[str, Sequence[str]] = "en",
         translation: Optional[str] = None,
+        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
         continue_on_failure: bool = False,
     ):
         """Initialize with YouTube video ID."""
@@ -159,6 +166,7 @@ def __init__(
         else:
             self.language = language
         self.translation = translation
+        self.transcript_format = transcript_format
         self.continue_on_failure = continue_on_failure
 
     @staticmethod
@@ -214,9 +222,19 @@ def load(self) -> List[Document]:
 
         transcript_pieces = transcript.fetch()
 
-        transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
-
-        return [Document(page_content=transcript, metadata=metadata)]
+        if self.transcript_format == TranscriptFormat.TEXT:
+            transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
+            return [Document(page_content=transcript, metadata=metadata)]
+        elif self.transcript_format == TranscriptFormat.LINES:
+            return [
+                Document(
+                    page_content=t["text"].strip(" "),
+                    metadata=dict((key, t[key]) for key in t if key != "text"),
+                )
+                for t in transcript_pieces
+            ]
+        else:
+            raise ValueError("Unknown transcript format.")
 
     def _get_video_info(self) -> dict:
         """Get important video information.