From 709331dd3287cced1025d620df5773baeb524f6c Mon Sep 17 00:00:00 2001 From: Dev Aggarwal Date: Wed, 14 Feb 2024 01:31:30 +0530 Subject: [PATCH] fix youtube links in doc extract --- daras_ai/image_input.py | 4 +++- recipes/DocExtract.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/daras_ai/image_input.py b/daras_ai/image_input.py index 2a7a70417..0d9530f22 100644 --- a/daras_ai/image_input.py +++ b/daras_ai/image_input.py @@ -112,7 +112,9 @@ def safe_filename(filename: str) -> str: return out -def truncate_filename(text: str, maxlen: int = 100, sep: str = "...") -> str: +def truncate_filename( + text: str | bytes, maxlen: int = 100, sep: str | bytes = "..." +) -> str | bytes: if len(text) <= maxlen: return text assert len(sep) <= maxlen diff --git a/recipes/DocExtract.py b/recipes/DocExtract.py index 6b0728646..9cca97806 100644 --- a/recipes/DocExtract.py +++ b/recipes/DocExtract.py @@ -390,9 +390,11 @@ def process_source( ) content_url = existing_values[Columns.content_url.value] - is_pdf = "application/pdf" in doc_meta.mime_type is_yt = is_yt_url(webpage_url) - is_video = "video/" in doc_meta.mime_type or "audio/" in doc_meta.mime_type + is_pdf = doc_meta and "application/pdf" in doc_meta.mime_type + is_video = doc_meta and ( + "video/" in doc_meta.mime_type or "audio/" in doc_meta.mime_type + ) if not content_url: yield "Downloading" if is_yt: