From 32c65729576b855bb3e299cd71f1b2a13843f9e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Cauwelier?= Date: Fri, 24 May 2024 11:17:21 +0200 Subject: [PATCH] also store and export video (upload) date Allowing things like sorting a CSV export by date. Database schema is backwards incompatible and requires manual migration. --- yt_fts/db_utils.py | 20 +++++++++++++++----- yt_fts/download.py | 6 +++--- yt_fts/export.py | 15 +++++++++++---- yt_fts/search.py | 16 +++++++++------- yt_fts/utils.py | 8 ++++++++ 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/yt_fts/db_utils.py b/yt_fts/db_utils.py index 8d03b08..96b6d45 100644 --- a/yt_fts/db_utils.py +++ b/yt_fts/db_utils.py @@ -4,7 +4,7 @@ from rich.console import Console from rich.table import Table -from .utils import show_message +from .utils import show_message, get_date from .config import get_db_path @@ -25,7 +25,8 @@ def make_db(db_path): "video_id": str, "video_title": str, "video_url": str, - "channel_id": str + "channel_id": str, + "video_date": str, }, pk="video_id", not_null={"video_title", "video_url"}, @@ -78,7 +79,7 @@ def add_channel_info(channel_id, channel_name, channel_url): }) -def add_video(channel_id, video_id, video_title, video_url): +def add_video(channel_id, video_id, video_title, video_url, video_date): conn = sqlite3.connect(get_db_path()) cur = conn.cursor() @@ -86,8 +87,8 @@ def add_video(channel_id, video_id, video_title, video_url): (video_id,)).fetchone() if existing_video is None: - cur.execute("INSERT INTO Videos (video_id, video_title, video_url, channel_id) VALUES (?, ?, ?, ?)", - (video_id, video_title, video_url, channel_id)) + cur.execute("INSERT INTO Videos (video_id, video_title, video_url, video_date, channel_id) VALUES (?, ?, ?, ?, ?)", + (video_id, video_title, video_url, video_date, channel_id)) conn.commit() else: @@ -144,6 +145,15 @@ def get_title_from_db(video_id): return db.execute(f"SELECT video_title FROM Videos WHERE video_id = ?", 
[video_id]).fetchone()[0] +def get_metadata_from_db(video_id): + + db = Database(get_db_path()) + + metadata = db.execute_returning_dicts(f"SELECT * FROM Videos WHERE video_id = ?", [video_id])[0] + metadata["video_date"] = get_date(metadata["video_date"]) + return metadata + + def get_channel_name_from_id(channel_id): db = Database(get_db_path()) diff --git a/yt_fts/download.py b/yt_fts/download.py index a180bce..4124155 100644 --- a/yt_fts/download.py +++ b/yt_fts/download.py @@ -1,4 +1,3 @@ - import tempfile import subprocess, re, os, sqlite3, json @@ -8,7 +7,7 @@ from .config import get_db_path from .db_utils import add_video -from .utils import parse_vtt +from .utils import parse_vtt, get_date from urllib.parse import urlparse from rich.progress import track @@ -215,9 +214,10 @@ def vtt_to_db(dir_path): vid_json = json.load(f) vid_title = vid_json['title'] + vid_date = get_date(vid_json['upload_date']) channel_id = vid_json['channel_id'] - add_video(channel_id, vid_id, vid_title, vid_url) + add_video(channel_id, vid_id, vid_title, vid_url, vid_date) vtt_json = parse_vtt(vtt) diff --git a/yt_fts/export.py b/yt_fts/export.py index 2aa6476..935ea02 100644 --- a/yt_fts/export.py +++ b/yt_fts/export.py @@ -4,7 +4,7 @@ from .db_utils import ( search_channel, search_video, search_all, - get_channel_name_from_video_id, get_title_from_db + get_channel_name_from_video_id, get_metadata_from_db, ) from .utils import time_to_secs, show_message @@ -35,17 +35,24 @@ def export_fts(text, scope, channel_id=None, video_id=None): with open(file_name, 'w', newline='') as csvfile: writer = csv.writer(csvfile) - writer.writerow(['Channel Name','Video Title', 'Quote', 'Time Stamp', 'Link']) + writer.writerow(['Channel Name','Video Title', 'Date', 'Quote', 'Time Stamp', 'Link']) for quote in res: video_id = quote["video_id"] channel_name = get_channel_name_from_video_id(video_id) - video_title = get_title_from_db(video_id) + metadata = get_metadata_from_db(video_id) time_stamp = 
quote["start_time"] subs = quote["text"] time = time_to_secs(time_stamp) - writer.writerow([channel_name,video_title, subs.strip(), time_stamp, f"https://youtu.be/{video_id}?t={time}"]) + writer.writerow([ + channel_name, + metadata['video_title'], + metadata['video_date'], + subs.strip(), + time_stamp, + f"https://youtu.be/{video_id}?t={time}" + ]) console = Console() diff --git a/yt_fts/search.py b/yt_fts/search.py index dca5d98..23344ed 100644 --- a/yt_fts/search.py +++ b/yt_fts/search.py @@ -55,7 +55,7 @@ def print_fts_res(res, query): quote_match["channel_name"] = get_channel_name_from_video_id(video_id) channel_names.append(quote_match["channel_name"]) - quote_match["video_title"] = get_title_from_db(video_id) + quote_match["metadata"] = get_metadata_from_db(video_id) quote_match["subs"] = bold_query_matches(quote["text"].strip(), query) quote_match["time_stamp"] = time_stamp quote_match["video_id"] = video_id @@ -94,7 +94,9 @@ def print_fts_res(res, query): fts_dict = {} for quote in fts_res: channel_name = quote["channel_name"] - video_name = quote["video_title"] + metadata = quote["metadata"] + video_name = metadata["video_title"] + video_date = metadata["video_date"] quote_data = { "quote": quote["subs"], "time_stamp": quote["time_stamp"], @@ -102,9 +104,9 @@ def print_fts_res(res, query): } if channel_name not in fts_dict: fts_dict[channel_name] = {} - if video_name not in fts_dict[channel_name]: - fts_dict[channel_name][video_name] = [] - fts_dict[channel_name][video_name].append(quote_data) + if (video_name, video_date) not in fts_dict[channel_name]: + fts_dict[channel_name][(video_name, video_date)] = [] + fts_dict[channel_name][(video_name, video_date)].append(quote_data) # Sort the list by the total number of quotes in each channel @@ -119,8 +121,8 @@ def print_fts_res(res, query): video_list = list(videos.items()) video_list.sort(key=lambda x: len(x[1])) - for video_name, quotes in video_list: - console.print(f" 
[bold][blue]{video_name}[/blue][/bold]") + for (video_name, video_date), quotes in video_list: + console.print(f" [bold][blue]{video_name}[/blue][/bold] ({video_date})") console.print("") # Sort the quotes by timestamp diff --git a/yt_fts/utils.py b/yt_fts/utils.py index ca865b4..ee9b3e2 100644 --- a/yt_fts/utils.py +++ b/yt_fts/utils.py @@ -1,6 +1,7 @@ """ This is where I'm putting all the functions that don't belong anywhere else """ +import datetime import re import sqlite3 @@ -91,6 +92,13 @@ def get_time_delta(timestamp1, timestamp2): return diff +def get_date(date_string): + # Python 3.11 would support datetime.date.fromisoformat('YYYYMMDD') directly + if '-' in date_string: + return datetime.date.fromisoformat(date_string) + return datetime.datetime.strptime(date_string, '%Y%m%d').date() + + # check if semantic search has been enabled for channel def check_ss_enabled(channel_id=None):