Skip to content

Commit

Permalink
community[patch]: youtube loader transcript format (#16625)
Browse files Browse the repository at this point in the history
- **Description**: YoutubeLoader right now returns one document that
contains the entire transcript. I think it would be useful to add an
option to return multiple documents, where each document would contain
one line of transcript with the start time and duration in the metadata.
For example,
[AssemblyAIAudioTranscriptLoader](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/assemblyai.py)
is implemented in a similar way: it lets you choose which format the
document loader uses for the documents it returns.
  • Loading branch information
sydneyidler authored Jan 26, 2024
1 parent a936472 commit 4e189cd
Showing 1 changed file with 21 additions and 3 deletions.
24 changes: 21 additions & 3 deletions libs/community/langchain_community/document_loaders/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import logging
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse
Expand Down Expand Up @@ -139,6 +140,11 @@ def _parse_video_id(url: str) -> Optional[str]:
return video_id


class TranscriptFormat(Enum):
    """Output format selected by ``YoutubeLoader``'s ``transcript_format``.

    The loader's ``load`` method compares its ``transcript_format`` attribute
    against these members to decide how the fetched transcript pieces are
    turned into documents.
    """

    # All transcript pieces are joined with single spaces into one document.
    TEXT = "text"
    # One document per transcript piece; every key of the piece other than
    # "text" is copied into that document's metadata.
    LINES = "lines"


class YoutubeLoader(BaseLoader):
"""Load `YouTube` transcripts."""

Expand All @@ -148,6 +154,7 @@ def __init__(
add_video_info: bool = False,
language: Union[str, Sequence[str]] = "en",
translation: Optional[str] = None,
transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
continue_on_failure: bool = False,
):
"""Initialize with YouTube video ID."""
Expand All @@ -159,6 +166,7 @@ def __init__(
else:
self.language = language
self.translation = translation
self.transcript_format = transcript_format
self.continue_on_failure = continue_on_failure

@staticmethod
Expand Down Expand Up @@ -214,9 +222,19 @@ def load(self) -> List[Document]:

transcript_pieces = transcript.fetch()

transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])

return [Document(page_content=transcript, metadata=metadata)]
if self.transcript_format == TranscriptFormat.TEXT:
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
return [Document(page_content=transcript, metadata=metadata)]
elif self.transcript_format == TranscriptFormat.LINES:
return [
Document(
page_content=t["text"].strip(" "),
metadata=dict((key, t[key]) for key in t if key != "text"),
)
for t in transcript_pieces
]
else:
raise ValueError("Unknown transcript format.")

def _get_video_info(self) -> dict:
"""Get important video information.
Expand Down

0 comments on commit 4e189cd

Please sign in to comment.