Merge pull request #147 from NotJoeMartinez/use_yt-dlp_package
Use the yt-dlp Python package instead of invoking the yt-dlp CLI through subprocess
NotJoeMartinez authored Jun 26, 2024
2 parents 8411277 + 0f446f6 commit e6b1f8c
Showing 4 changed files with 82 additions and 90 deletions.
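
At its core, the change swaps CLI invocations parsed from stdout for direct calls into yt-dlp's Python API. A minimal sketch of the before/after pattern the diff applies throughout (channel_url stands in for a channel /videos URL):

    # Before: shell out to the yt-dlp binary and parse its stdout
    import subprocess
    res = subprocess.run(
        ["yt-dlp", "--flat-playlist", "--print", "id", channel_url],
        capture_output=True, check=True,
    )
    video_ids = res.stdout.decode().splitlines()

    # After: call the yt_dlp package and read structured metadata
    import yt_dlp
    with yt_dlp.YoutubeDL({"extract_flat": True, "quiet": True}) as ydl:
        info = ydl.extract_info(channel_url, download=False)
    video_ids = [entry["id"] for entry in info["entries"]]
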
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -26,7 +26,8 @@ dependencies = [
"requests==2.31.0",
"rich==13.7.1",
"sqlite-utils==3.36",
"beautifulsoup4==4.12.3"
"beautifulsoup4==4.12.3",
"yt-dlp==2024.5.27"
]

[project.scripts]
3 changes: 2 additions & 1 deletion requirements.txt
@@ -4,4 +4,5 @@ chromadb==0.5.2
requests==2.31.0
rich==13.7.1
sqlite-utils==3.36
beautifulsoup4==4.12.3
beautifulsoup4==4.12.3
yt-dlp==2024.5.27
151 changes: 68 additions & 83 deletions yt_fts/download.py
@@ -1,6 +1,11 @@
import yt_dlp
import tempfile
import subprocess, re, os, sqlite3, json
import re
import os
import sqlite3
import json

from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
@@ -12,6 +17,7 @@

from rich.progress import track
from rich.console import Console
console = Console()

def handle_reject_consent_cookie(channel_url, s):
"""
@@ -82,80 +88,60 @@ def get_videos_list(channel_url):
console = Console()

with console.status("[bold green]Scraping video urls, this might take a little...") as status:
cmd = [
"yt-dlp",
"--flat-playlist",
"--print",
"id",
f"{channel_url}"
]
res = subprocess.run(cmd, capture_output=True, check=True)
list_of_videos_urls = res.stdout.decode().splitlines()

streams_url = channel_url.replace("/videos", "/streams")
cmd = [
"yt-dlp",
"--flat-playlist",
"--print",
"id",
streams_url
]
try:
res = subprocess.run(cmd, capture_output=True, check=True)
live_stream_urls = res.stdout.decode().splitlines()
if len(live_stream_urls) > 0:
list_of_videos_urls.extend(live_stream_urls)
except subprocess.CalledProcessError:
console.print("[bold red]No streams tab found or error fetching streams.")
ydl_opts = {
'extract_flat': True,
'quiet': True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(channel_url, download=False)
list_of_videos_urls = [entry['id'] for entry in info['entries']]

streams_url = channel_url.replace("/videos", "/streams")
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
streams_info = ydl.extract_info(streams_url, download=False)
live_stream_urls = [entry['id'] for entry in streams_info['entries']]
if len(live_stream_urls) > 0:
list_of_videos_urls.extend(live_stream_urls)
except Exception:
console.print("[bold red]No streams found")

return list_of_videos_urls
return list_of_videos_urls
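
With 'extract_flat': True, extract_info resolves the channel page without fetching each video's full metadata, so every item in info['entries'] is a lightweight dict; the code above assumes the 'id' key is always present. A slightly more defensive variant (a sketch, not part of this commit):

    # Illustrative only: skip entries without an 'id' instead of
    # raising KeyError, and tolerate a missing 'entries' list
    list_of_videos_urls = [
        entry["id"]
        for entry in (info.get("entries") or [])
        if entry.get("id")
    ]
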


def get_playlist_data(playlist_url):
"""
Returns a list of channel ids and video ids from a playlist
[
['channel_id', 'video_id'],
]
"""

console = Console()

with console.status("[bold green]Scraping video urls, this might take a little...") as status:
cmd = [
"yt-dlp",
"--print",
"%(channel)s,%(channel_id)s,%(id)s",
f"{playlist_url}"
]
res = subprocess.run(cmd, capture_output=True, check=True)
data = res.stdout.decode().splitlines()

playlist_data = []

for vid in data:
vid = vid.split(',')
vid_obj = {
'channel_name': vid[0],
'channel_id': vid[1],
'video_id': vid[2],
'channel_url': f"https://www.youtube.com/channel/{vid[1]}/videos",
'video_url': f"https://youtu.be/{vid[2]}"
}
playlist_data.append(vid_obj)

return playlist_data


def download_vtts(number_of_jobs, video_ids, language ,tmp_dir):
ydl_opts = {
'quiet': True,
'extract_flat': True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(playlist_url, download=False)
playlist_data = []
for entry in info['entries']:
vid_obj = {
'channel_name': entry['channel'],
'channel_id': entry['channel_id'],
'video_id': entry['id'],
'channel_url': f"https://www.youtube.com/channel/{entry['channel_id']}/videos",
'video_url': f"https://youtu.be/{entry['id']}"
}
playlist_data.append(vid_obj)

return playlist_data
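
Given the code above, get_playlist_data yields one dict per playlist entry (the URL below is a placeholder):

    # Example usage with a hypothetical playlist URL
    data = get_playlist_data("https://www.youtube.com/playlist?list=PLxxxxxx")
    # data[0] -> {
    #     'channel_name': '...',
    #     'channel_id': 'UC...',
    #     'video_id': '...',
    #     'channel_url': 'https://www.youtube.com/channel/UC.../videos',
    #     'video_url': 'https://youtu.be/...'
    # }
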


def download_vtts(number_of_jobs, video_ids, language, tmp_dir):
"""
Multi-threaded download of vtt files
"""

# showing progress on a multi-threaded task might be more trouble than it's worth
# console = Console()

executor = ThreadPoolExecutor(number_of_jobs)
futures = []

@@ -168,17 +154,26 @@ def download_vtts(number_of_jobs, video_ids, language ,tmp_dir):
futures[i].result()
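
On the result() loop above: Future.result() blocks until the submitted call completes and re-raises any exception thrown in the worker thread, so a failure inside get_vtt surfaces rather than being silently dropped. A generic sketch of the submit/join pattern (illustrative; video_urls is a hypothetical list, not a name from this diff):

    # Not the collapsed lines above -- just the general pattern
    with ThreadPoolExecutor(max_workers=number_of_jobs) as executor:
        futures = [
            executor.submit(get_vtt, tmp_dir, url, language)
            for url in video_urls
        ]
        for future in futures:
            future.result()  # blocks; re-raises worker exceptions
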


def quiet_progress_hook(d):
if d['status'] == 'finished':
filename = Path(d['filename']).name
print(f" -> {filename}")


def get_vtt(tmp_dir, video_url, language):
subprocess.run([
"yt-dlp",
"-o", f"{tmp_dir}/%(id)s",
"--write-info-json",
"--write-auto-sub",
"--convert-subs", "vtt",
"--skip-download",
"--sub-langs", f"{language},-live_chat",
video_url
])
ydl_opts = {
'outtmpl': f'{tmp_dir}/%(id)s',
'writeinfojson': True,
'writeautomaticsub': True,
'subtitlesformat': 'vtt',
'skip_download': True,
'subtitleslangs': [language, '-live_chat'],
'quiet': True,
'progress_hooks': [quiet_progress_hook]
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download([video_url])
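
The option keys roughly mirror the old CLI flags: 'outtmpl' (-o), 'writeinfojson' (--write-info-json), 'writeautomaticsub' (--write-auto-sub), 'skip_download' (--skip-download), 'subtitleslangs' (--sub-langs). One subtlety: 'subtitlesformat' corresponds to --sub-format, a download-format preference, while the old --convert-subs post-conversion would map to a postprocessor instead. A hedged sketch of that equivalent, based on yt-dlp's embedding docs:

    # Sketch: the Python-API counterpart of `--convert-subs vtt`
    ydl_opts['postprocessors'] = [
        {'key': 'FFmpegSubtitlesConvertor', 'format': 'vtt'}
    ]
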


def vtt_to_db(dir_path):
@@ -187,21 +182,11 @@ def vtt_to_db(dir_path):
the vtt parsing function, then inserts the data into the database.
"""
items = os.listdir(dir_path)
file_paths = []

for item in items:
# ignore other files e.g. info.json files
if not item.endswith('.vtt'):
continue

item_path = os.path.join(dir_path, item)
if os.path.isfile(item_path):
file_paths.append(item_path)
file_paths = [os.path.join(dir_path, item) for item in items if item.endswith('.vtt')]

con = sqlite3.connect(get_db_path())
cur = con.cursor()


for vtt in track(file_paths, description="Adding subtitles to database..."):
base_name = os.path.basename(vtt)

@@ -213,7 +198,7 @@ def vtt_to_db(dir_path):
with open(vid_json_path, 'r', encoding='utf-8', errors='ignore') as f:
vid_json = json.load(f)

vid_title = vid_json['title']
vid_title = vid_json['title']
vid_date = get_date(vid_json['upload_date'])
channel_id = vid_json['channel_id']

15 changes: 10 additions & 5 deletions yt_fts/yt_fts.py
@@ -1,4 +1,5 @@
import click
import sys
import requests

from .config import get_config_path, get_db_path, get_or_make_chroma_path
@@ -10,6 +11,7 @@
from rich.console import Console

YT_FTS_VERSION = "0.1.49"
console = Console()

@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(YT_FTS_VERSION, message='yt_fts version: %(version)s')
@@ -32,13 +34,16 @@ def cli():
@click.option("-j", "--number-of-jobs", type=int, default=1, help="Optional number of jobs to parallelize the run")
def download(url, playlist, language, number_of_jobs):

console = Console()
s = requests.session()
handle_reject_consent_cookie(url, s)

if playlist == True:
if "playlist?" not in url:
console.print(f"\n[bold red]Error:[/bold red] Invalid playlist url {url}")
print("\nYouTube playlists have this format: https://www.youtube.com/playlist?list=<playlist_id>\n")
sys.exit(1)
download_playlist(url, s, language, number_of_jobs)
return
sys.exit(0)

# find out if the channel exists on the internet
with console.status("[bold green]Getting Channel ID...") as status:
@@ -141,9 +146,9 @@ def delete(channel):
channel_name = get_channel_name_from_id(channel_id)
channel_url = f"https://www.youtube.com/channel/{channel_id}/videos"

print(f"Deleting channel {channel_name}: {channel_url}")
print("Are you sure you want to delete this channel and all its data?")
confirm = input("y/n: ")
console.print(f"Deleting channel [bold]\"{channel_name}\"[/bold]: {channel_url}")
console.print("[bold]Are you sure you want to delete this channel and all its data?[/bold]")
confirm = input("(Y/n): ")

if confirm == "y":
delete_channel(channel_id)
