From 3452169712b59d1034ba7da498bafaa20731f40f Mon Sep 17 00:00:00 2001 From: Aqib Ansari Date: Mon, 16 Dec 2024 03:44:38 +0530 Subject: [PATCH] Fix YoutubeLoader to use yt-dlp for metadata fetching --- .../document_loaders/youtube.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/youtube.py b/libs/community/langchain_community/document_loaders/youtube.py index 67c1569adf8d6..121f59decc628 100644 --- a/libs/community/langchain_community/document_loaders/youtube.py +++ b/libs/community/langchain_community/document_loaders/youtube.py @@ -312,24 +312,35 @@ def _get_video_info(self) -> Dict: - and more. """ try: - from pytube import YouTube + from yt_dlp import YoutubeDL except ImportError: raise ImportError( - 'Could not import "pytube" Python package. ' - "Please install it with `pip install pytube`." + 'Could not import "yt_dlp" Python package. ' + "Please install it with `pip install yt_dlp`." ) - yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}") + ydl_opts = {"quiet": True, "no_warnings": True, "skip_download": True} + with YoutubeDL(ydl_opts) as ydl: + yt = ydl.extract_info( + f"https://www.youtube.com/watch?v={self.video_id}", download=False + ) + publish_date = yt.get("upload_date") + if publish_date: + try: + from datetime import datetime + + publish_date = datetime.strptime(publish_date, "%Y%m%d") + except (ValueError, TypeError): + publish_date = "Unknown" video_info = { - "title": yt.title or "Unknown", - "description": yt.description or "Unknown", - "view_count": yt.views or 0, - "thumbnail_url": yt.thumbnail_url or "Unknown", - "publish_date": yt.publish_date.strftime("%Y-%m-%d %H:%M:%S") - if yt.publish_date - else "Unknown", - "length": yt.length or 0, - "author": yt.author or "Unknown", + "title": yt.get("title", "Unknown"), + "description": yt.get("description", "Unknown"), + "view_count": yt.get("view_count", 0), + "publish_date": publish_date, + "length": yt.get("duration", 0), + "author": yt.get("uploader", "Unknown"), + "channel_id": yt.get("channel_id", "Unknown"), + "webpage_url": yt.get("webpage_url", "Unknown"), } return video_info