Merge pull request #147 from NotJoeMartinez/use_yt-dlp_package
Use the yt-dlp Python package instead of invoking the yt-dlp CLI through subprocess
NotJoeMartinez authored Jun 26, 2024
2 parents 8411277 + 0f446f6 commit e6b1f8c
Showing 4 changed files with 82 additions and 90 deletions.
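
At its core, the change swaps CLI invocations parsed from stdout for direct calls into yt-dlp's Python API. A minimal sketch of the before/after pattern the diff applies throughout (channel_url stands in for a channel /videos URL):

    # Before: shell out to the yt-dlp binary and parse its stdout
    import subprocess
    res = subprocess.run(
        ["yt-dlp", "--flat-playlist", "--print", "id", channel_url],
        capture_output=True, check=True,
    )
    video_ids = res.stdout.decode().splitlines()

    # After: call the yt_dlp package and read structured metadata
    import yt_dlp
    with yt_dlp.YoutubeDL({"extract_flat": True, "quiet": True}) as ydl:
        info = ydl.extract_info(channel_url, download=False)
    video_ids = [entry["id"] for entry in info["entries"]]
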
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -26,7 +26,8 @@ dependencies = [
"requests==2.31.0",
"rich==13.7.1",
"sqlite-utils==3.36",
"beautifulsoup4==4.12.3"
"beautifulsoup4==4.12.3",
"yt-dlp==2024.5.27"
]

[project.scripts]
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,4 +4,5 @@ chromadb==0.5.2
requests==2.31.0
rich==13.7.1
sqlite-utils==3.36
beautifulsoup4==4.12.3
beautifulsoup4==4.12.3
yt-dlp==2024.5.27
151 changes: 68 additions & 83 deletions yt_fts/download.py
@@ -1,6 +1,11 @@
import yt_dlp
import tempfile
import subprocess, re, os, sqlite3, json
import re
import os
import sqlite3
import json

from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
@@ -12,6 +17,7 @@

from rich.progress import track
from rich.console import Console
console = Console()

def handle_reject_consent_cookie(channel_url, s):
"""
@@ -82,80 +88,60 @@ def get_videos_list(channel_url):
console = Console()

with console.status("[bold green]Scraping video urls, this might take a little...") as status:
cmd = [
"yt-dlp",
"--flat-playlist",
"--print",
"id",
f"{channel_url}"
]
res = subprocess.run(cmd, capture_output=True, check=True)
list_of_videos_urls = res.stdout.decode().splitlines()

streams_url = channel_url.replace("/videos", "/streams")
cmd = [
"yt-dlp",
"--flat-playlist",
"--print",
"id",
streams_url
]
try:
res = subprocess.run(cmd, capture_output=True, check=True)
live_stream_urls = res.stdout.decode().splitlines()
if len(live_stream_urls) > 0:
list_of_videos_urls.extend(live_stream_urls)
except subprocess.CalledProcessError:
console.print("[bold red]No streams tab found or error fetching streams.")
ydl_opts = {
'extract_flat': True,
'quiet': True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(channel_url, download=False)
list_of_videos_urls = [entry['id'] for entry in info['entries']]

streams_url = channel_url.replace("/videos", "/streams")
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
streams_info = ydl.extract_info(streams_url, download=False)
live_stream_urls = [entry['id'] for entry in streams_info['entries']]
if len(live_stream_urls) > 0:
list_of_videos_urls.extend(live_stream_urls)
except Exception:
console.print("[bold red]No streams found")

return list_of_videos_urls
return list_of_videos_urls
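
With 'extract_flat': True, extract_info resolves the channel page without fetching each video's full metadata, so every item in info['entries'] is a lightweight dict; the code above assumes the 'id' key is always present. A slightly more defensive variant (a sketch, not part of this commit):

    # Illustrative only: skip entries without an 'id' instead of
    # raising KeyError, and tolerate a missing 'entries' list
    list_of_videos_urls = [
        entry["id"]
        for entry in (info.get("entries") or [])
        if entry.get("id")
    ]
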


def get_playlist_data(playlist_url):
"""
Returns a list of channel ids and video ids from a playlist
[
['channel_id', 'video_id'],
]
"""

console = Console()

with console.status("[bold green]Scraping video urls, this might take a little...") as status:
cmd = [
"yt-dlp",
"--print",
"%(channel)s,%(channel_id)s,%(id)s",
f"{playlist_url}"
]
res = subprocess.run(cmd, capture_output=True, check=True)
data = res.stdout.decode().splitlines()

playlist_data = []

for vid in data:
vid = vid.split(',')
vid_obj = {
'channel_name': vid[0],
'channel_id': vid[1],
'video_id': vid[2],
'channel_url': f"https://www.youtube.com/channel/{vid[1]}/videos",
'video_url': f"https://youtu.be/{vid[2]}"
}
playlist_data.append(vid_obj)

return playlist_data


def download_vtts(number_of_jobs, video_ids, language ,tmp_dir):
ydl_opts = {
'quiet': True,
'extract_flat': True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(playlist_url, download=False)
playlist_data = []
for entry in info['entries']:
vid_obj = {
'channel_name': entry['channel'],
'channel_id': entry['channel_id'],
'video_id': entry['id'],
'channel_url': f"https://www.youtube.com/channel/{entry['channel_id']}/videos",
'video_url': f"https://youtu.be/{entry['id']}"
}
playlist_data.append(vid_obj)

return playlist_data
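
Given the code above, get_playlist_data yields one dict per playlist entry (the URL below is a placeholder):

    # Example usage with a hypothetical playlist URL
    data = get_playlist_data("https://www.youtube.com/playlist?list=PLxxxxxx")
    # data[0] -> {
    #     'channel_name': '...',
    #     'channel_id': 'UC...',
    #     'video_id': '...',
    #     'channel_url': 'https://www.youtube.com/channel/UC.../videos',
    #     'video_url': 'https://youtu.be/...'
    # }
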


def download_vtts(number_of_jobs, video_ids, language, tmp_dir):
"""
Multi-threaded download of vtt files
"""

# showing progress on a multi-threaded task might be more trouble than it's worth
# console = Console()

executor = ThreadPoolExecutor(number_of_jobs)
futures = []

@@ -168,17 +154,26 @@ def download_vtts(number_of_jobs, video_ids, language ,tmp_dir):
futures[i].result()
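
On the result() loop above: Future.result() blocks until the submitted call completes and re-raises any exception thrown in the worker thread, so a failure inside get_vtt surfaces rather than being silently dropped. A generic sketch of the submit/join pattern (illustrative; video_urls is a hypothetical list, not a name from this diff):

    # Not the collapsed lines above -- just the general pattern
    with ThreadPoolExecutor(max_workers=number_of_jobs) as executor:
        futures = [
            executor.submit(get_vtt, tmp_dir, url, language)
            for url in video_urls
        ]
        for future in futures:
            future.result()  # blocks; re-raises worker exceptions
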


def quiet_progress_hook(d):
if d['status'] == 'finished':
filename = Path(d['filename']).name
print(f" -> {filename}")


def get_vtt(tmp_dir, video_url, language):
subprocess.run([
"yt-dlp",
"-o", f"{tmp_dir}/%(id)s",
"--write-info-json",
"--write-auto-sub",
"--convert-subs", "vtt",
"--skip-download",
"--sub-langs", f"{language},-live_chat",
video_url
])
ydl_opts = {
'outtmpl': f'{tmp_dir}/%(id)s',
'writeinfojson': True,
'writeautomaticsub': True,
'subtitlesformat': 'vtt',
'skip_download': True,
'subtitleslangs': [language, '-live_chat'],
'quiet': True,
'progress_hooks': [quiet_progress_hook]
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([video_url])
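
The option keys roughly mirror the old CLI flags: 'outtmpl' (-o), 'writeinfojson' (--write-info-json), 'writeautomaticsub' (--write-auto-sub), 'skip_download' (--skip-download), 'subtitleslangs' (--sub-langs). One subtlety: 'subtitlesformat' corresponds to --sub-format, a download-format preference, while the old --convert-subs post-conversion would map to a postprocessor instead. A hedged sketch of that equivalent, based on yt-dlp's embedding docs:

    # Sketch: the Python-API counterpart of `--convert-subs vtt`
    ydl_opts['postprocessors'] = [
        {'key': 'FFmpegSubtitlesConvertor', 'format': 'vtt'}
    ]
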


def vtt_to_db(dir_path):
@@ -187,21 +182,11 @@ def vtt_to_db(dir_path):
the vtt parsing function, then inserts the data into the database.
"""
items = os.listdir(dir_path)
file_paths = []

for item in items:
# ignore other files e.g. info.json files
if not item.endswith('.vtt'):
continue

item_path = os.path.join(dir_path, item)
if os.path.isfile(item_path):
file_paths.append(item_path)
file_paths = [os.path.join(dir_path, item) for item in items if item.endswith('.vtt')]

con = sqlite3.connect(get_db_path())
cur = con.cursor()


for vtt in track(file_paths, description="Adding subtitles to database..."):
base_name = os.path.basename(vtt)

@@ -213,7 +198,7 @@ def vtt_to_db(dir_path):
with open(vid_json_path, 'r', encoding='utf-8', errors='ignore') as f:
vid_json = json.load(f)

vid_title = vid_json['title']
vid_title = vid_json['title']
vid_date = get_date(vid_json['upload_date'])
channel_id = vid_json['channel_id']

15 changes: 10 additions & 5 deletions yt_fts/yt_fts.py
@@ -1,4 +1,5 @@
import click
import sys
import requests

from .config import get_config_path, get_db_path, get_or_make_chroma_path
@@ -10,6 +11,7 @@
from rich.console import Console

YT_FTS_VERSION = "0.1.49"
console = Console()

@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(YT_FTS_VERSION, message='yt_fts version: %(version)s')
@@ -32,13 +34,16 @@ def cli():
@click.option("-j", "--number-of-jobs", type=int, default=1, help="Optional number of jobs to parallelize the run")
def download(url, playlist, language, number_of_jobs):

console = Console()
s = requests.session()
handle_reject_consent_cookie(url, s)

if playlist == True:
if "playlist?" not in url:
console.print(f"\n[bold red]Error:[/bold red] Invalid playlist url {url}")
print("\nYouTube playlists have this format: https://www.youtube.com/playlist?list=<playlist_id>\n")
sys.exit(1)
download_playlist(url, s, language, number_of_jobs)
return
sys.exit(0)

# find out if the channel exists on the internet
with console.status("[bold green]Getting Channel ID...") as status:
@@ -141,9 +146,9 @@ def delete(channel):
channel_name = get_channel_name_from_id(channel_id)
channel_url = f"https://www.youtube.com/channel/{channel_id}/videos"

print(f"Deleting channel {channel_name}: {channel_url}")
print("Are you sure you want to delete this channel and all its data?")
confirm = input("y/n: ")
console.print(f"Deleting channel [bold]\"{channel_name}\"[/bold]: {channel_url}")
console.print("[bold]Are you sure you want to delete this channel and all its data?[/bold]")
confirm = input("(Y/n): ")

if confirm == "y":
delete_channel(channel_id)
