diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py
index 2720b19a9..3f18f7e1d 100644
--- a/daras_ai_v2/gdrive_downloader.py
+++ b/daras_ai_v2/gdrive_downloader.py
@@ -1,9 +1,11 @@
 import io
 
 from furl import furl
+import requests
 
 from daras_ai_v2.exceptions import UserError
 from daras_ai_v2.functional import flatmap_parallel
+from daras_ai_v2.exceptions import raise_for_status
 
 
 def is_gdrive_url(f: furl) -> bool:
@@ -60,7 +62,7 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
     return filter(None, urls)
 
 
-def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
+def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
     from googleapiclient import discovery
     from googleapiclient.http import MediaIoBaseDownload
 
@@ -68,19 +70,20 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
     file_id = url_to_gdrive_file_id(f)
     # get metadata
     service = discovery.build("drive", "v3")
-    # get files in drive directly
-    if f.host == "drive.google.com":
-        request = service.files().get_media(
-            fileId=file_id,
-            supportsAllDrives=True,
-        )
-    # export google docs to appropriate type
-    else:
-        mime_type, _ = docs_export_mimetype(f)
-        request = service.files().export_media(
-            fileId=file_id,
-            mimeType=mime_type,
-        )
+
+    if f.host != "drive.google.com":
+        # export google docs to appropriate type
+        export_mime_type, _ = docs_export_mimetype(f)
+        if f_url_export := export_links.get(export_mime_type, None):
+            r = requests.get(f_url_export)
+            raise_for_status(r, is_user_url=True)
+            file_bytes = r.content
+            return file_bytes, export_mime_type
+
+    request = service.files().get_media(
+        fileId=file_id,
+        supportsAllDrives=True,
+    )
     # download
     file = io.BytesIO()
     downloader = MediaIoBaseDownload(file, request)
@@ -88,8 +91,9 @@ def gdrive_download(f: furl, mime_type: str) -> tuple[bytes, str]:
     while done is False:
         _, done = downloader.next_chunk()
         # print(f"Download {int(status.progress() * 100)}%")
-    f_bytes = file.getvalue()
-    return f_bytes, mime_type
+    file_bytes = file.getvalue()
+
+    return file_bytes, mime_type
 
 
 def docs_export_mimetype(f: furl) -> tuple[str, str]:
@@ -109,8 +113,10 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
         mime_type = "text/csv"
         ext = ".csv"
     elif "presentation" in f.path.segments:
-        mime_type = "application/pdf"
-        ext = ".pdf"
+        mime_type = (
+            "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+        )
+        ext = ".pptx"
     elif "drawings" in f.path.segments:
         mime_type = "application/pdf"
         ext = ".pdf"
@@ -128,7 +134,7 @@ def gdrive_metadata(file_id: str) -> dict:
         .get(
             supportsAllDrives=True,
             fileId=file_id,
-            fields="name,md5Checksum,modifiedTime,mimeType,size",
+            fields="name,md5Checksum,modifiedTime,mimeType,size,exportLinks",
         )
         .execute()
     )
diff --git a/daras_ai_v2/glossary.py b/daras_ai_v2/glossary.py
index 87618b87c..77c252173 100644
--- a/daras_ai_v2/glossary.py
+++ b/daras_ai_v2/glossary.py
@@ -15,7 +15,7 @@ def validate_glossary_document(document: str):
     metadata = doc_url_to_file_metadata(document)
 
     f_bytes, mime_type = download_content_bytes(
-        f_url=document, mime_type=metadata.mime_type
+        f_url=document, mime_type=metadata.mime_type, export_links=metadata.export_links
     )
     df = tabular_bytes_to_str_df(
         f_name=metadata.name, f_bytes=f_bytes, mime_type=mime_type
diff --git a/daras_ai_v2/office_utils_pptx.py b/daras_ai_v2/office_utils_pptx.py
index e45843e9a..780841424 100644
--- a/daras_ai_v2/office_utils_pptx.py
+++ b/daras_ai_v2/office_utils_pptx.py
@@ -34,7 +34,11 @@ def pptx_to_text_pages(f: typing.BinaryIO, use_form_reco: bool = False) -> list[
             except Exception as e:
                 slide_content.append(f"  Error processing shape: {e}")
 
+        if slide.has_notes_slide:
+            slide_content.extend(handle_author_notes(slide))
+
         slides_text.append("\n".join(slide_content) + "\n")
+
     return slides_text
 
 
@@ -43,81 +47,55 @@ def handle_text_elements(shape) -> list[str]:
     Handles text elements within a shape, including lists.
     """
     text_elements = []
-    is_a_list = False
-    is_list_group_created = False
-    enum_list_item_value = 0
-    bullet_type = "None"
-    list_label = "LIST"
     namespaces = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
 
-    # Identify if shape contains lists
+    current_list_type = None
+    list_item_index = 0
+
     for paragraph in shape.text_frame.paragraphs:
         p = paragraph._element
+        paragraph_text = ""
+        is_list_item = False
+
+        # Determine list type
         if p.find(".//a:buChar", namespaces=namespaces) is not None:
-            bullet_type = "Bullet"
-            is_a_list = True
+            current_list_type = "Bullet"
+            is_list_item = True
         elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
-            bullet_type = "Numbered"
-            is_a_list = True
+            current_list_type = "Numbered"
+            is_list_item = True
+        elif paragraph.level > 0:  # Indented text is also treated as a list
+            current_list_type = "Bullet"
+            is_list_item = True
         else:
-            is_a_list = False
-
-        if paragraph.level > 0:
-            is_a_list = True
-
-        if is_a_list:
-            if bullet_type == "Numbered":
-                list_label = "ORDERED_LIST"
-
-    # Iterate through paragraphs to build up text
-    for paragraph in shape.text_frame.paragraphs:
-        p = paragraph._element
-        enum_list_item_value += 1
-        inline_paragraph_text = ""
-        inline_list_item_text = ""
-        doc_label = "PARAGRAPH"
-
-        for e in p.iterfind(".//a:r", namespaces=namespaces):
-            if len(e.text.strip()) > 0:
-                e_is_a_list_item = False
-                is_numbered = False
-                if p.find(".//a:buChar", namespaces=namespaces) is not None:
-                    bullet_type = "Bullet"
-                    e_is_a_list_item = True
-                elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
-                    bullet_type = "Numbered"
-                    is_numbered = True
-                    e_is_a_list_item = True
-                else:
-                    e_is_a_list_item = False
-
-                if e_is_a_list_item:
-                    if len(inline_paragraph_text) > 0:
-                        text_elements.append(inline_paragraph_text)
-                    inline_list_item_text += e.text
+            current_list_type = None
+            list_item_index = 0  # Reset numbering if no list
+
+        # Process paragraph text
+        for run in p.iterfind(".//a:r", namespaces=namespaces):
+            run_text = run.text.strip() if run.text else ""
+            if run_text:
+                paragraph_text += run_text
+
+        if is_list_item:
+            if current_list_type == "Numbered":
+                list_item_index += 1
+                list_prefix = f"{list_item_index}."
+            else:
+                list_prefix = "•"  # Default bullet symbol
+            text_elements.append(f"{list_prefix} {paragraph_text}")
+        else:
+            # Handle placeholders for titles or subtitles
+            if shape.is_placeholder:
+                placeholder_type = shape.placeholder_format.type
+                if placeholder_type == PP_PLACEHOLDER.TITLE:
+                    text_elements.append(f"TITLE: {paragraph_text}")
+                elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                    text_elements.append(f"SECTION_HEADER: {paragraph_text}")
                 else:
-                    if shape.is_placeholder:
-                        placeholder_type = shape.placeholder_format.type
-                        if placeholder_type in [
-                            PP_PLACEHOLDER.CENTER_TITLE,
-                            PP_PLACEHOLDER.TITLE,
-                        ]:
-                            doc_label = "TITLE"
-                        elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                            doc_label = "SECTION_HEADER"
-                    enum_list_item_value = 0
-                    inline_paragraph_text += e.text
-
-        if len(inline_paragraph_text) > 0:
-            text_elements.append(inline_paragraph_text)
-
-        if len(inline_list_item_text) > 0:
-            enum_marker = ""
-            if is_numbered:
-                enum_marker = str(enum_list_item_value) + "."
-            if not is_list_group_created:
-                is_list_group_created = True
-            text_elements.append(f"{enum_marker} {inline_list_item_text}")
+                    text_elements.append(paragraph_text)
+            else:
+                text_elements.append(paragraph_text)
 
     return text_elements
 
 
@@ -171,7 +149,7 @@ def handle_tables(shape) -> list[str]:
     for row in grid[1:]:
         line = "|" + "|".join(row) + "|"
         table_text.append(line)
-        print(line)
+        # print(line)
 
     return table_text
 
 
@@ -207,6 +185,17 @@ def handle_charts(shape) -> list[str]:
     return chart_text
 
 
+def handle_author_notes(slide) -> list[str]:
+    """Extract speaker notes attached to a slide, prefixed with a label."""
+    notes = []
+    if slide.notes_slide.notes_text_frame:
+        notes_text = slide.notes_slide.notes_text_frame.text.strip()
+        if notes_text:
+            notes.append("Speaker Notes:")
+            notes.append(notes_text)
+    return notes
+
+
 # TODO :azure form reco to extract text from images
 def handle_pictures(shape):
     pass
diff --git a/daras_ai_v2/vector_search.py b/daras_ai_v2/vector_search.py
index f78c39260..3d54d5383 100644
--- a/daras_ai_v2/vector_search.py
+++ b/daras_ai_v2/vector_search.py
@@ -310,6 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
         etag = meta.get("md5Checksum") or meta.get("modifiedTime")
         mime_type = meta["mimeType"]
         total_bytes = int(meta.get("size") or 0)
+        export_links = meta.get("exportLinks", {})
     else:
         try:
             if is_user_uploaded_url(f_url):
@@ -327,6 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
             mime_type = None
             etag = None
             total_bytes = 0
+            export_links = {}
         else:
             name = (
                 r.headers.get("content-disposition", "")
@@ -338,6 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
                 etag = etag.strip('"')
             mime_type = get_mimetype_from_response(r)
             total_bytes = int(r.headers.get("content-length") or 0)
+            export_links = {}
     # extract filename from url as a fallback
     if not name:
         if is_user_uploaded_url(f_url):
@@ -347,9 +350,12 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
     # guess mimetype from name as a fallback
     if not mime_type:
         mime_type = mimetypes.guess_type(name)[0]
-    return FileMetadata(
+
+    file_metadata = FileMetadata(
         name=name, etag=etag, mime_type=mime_type or "", total_bytes=total_bytes
     )
+    file_metadata.export_links = export_links or {}
+    return file_metadata
 
 
 def yt_dlp_get_video_entries(url: str) -> list[dict]:
@@ -650,7 +656,10 @@ def doc_url_to_text_pages(
     Download document from url and convert to text pages.
""" f_bytes, mime_type = download_content_bytes( - f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url + f_url=f_url, + mime_type=file_meta.mime_type, + is_user_url=is_user_url, + export_links=file_meta.export_links, ) if not f_bytes: return [] @@ -664,14 +673,18 @@ def doc_url_to_text_pages( def download_content_bytes( - *, f_url: str, mime_type: str, is_user_url: bool = True + *, + f_url: str, + mime_type: str, + is_user_url: bool = True, + export_links: dict[str, str] = {}, ) -> tuple[bytes, str]: if is_yt_dlp_able_url(f_url): return download_youtube_to_wav(f_url), "audio/wav" f = furl(f_url) if is_gdrive_url(f): # download from google drive - return gdrive_download(f, mime_type) + return gdrive_download(f, mime_type, export_links) try: # download from url if is_user_uploaded_url(f_url): diff --git a/files/models.py b/files/models.py index 12af91a7a..afb6504cc 100644 --- a/files/models.py +++ b/files/models.py @@ -8,6 +8,10 @@ class FileMetadata(models.Model): mime_type = models.CharField(max_length=255, default="", blank=True) total_bytes = models.PositiveIntegerField(default=0, blank=True) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.export_links = {} + def __str__(self): ret = f"{self.name or 'Unnamed'} - {self.mime_type}" if self.total_bytes: diff --git a/recipes/BulkRunner.py b/recipes/BulkRunner.py index 25c99a2dc..4eb7d654e 100644 --- a/recipes/BulkRunner.py +++ b/recipes/BulkRunner.py @@ -612,7 +612,7 @@ def get_columns(files: list[str]) -> list[str]: def read_df_any(f_url: str) -> "pd.DataFrame": file_meta = doc_url_to_file_metadata(f_url) f_bytes, mime_type = download_content_bytes( - f_url=f_url, mime_type=file_meta.mime_type + f_url=f_url, mime_type=file_meta.mime_type, export_links=file_meta.export_links ) df = tabular_bytes_to_any_df( f_name=file_meta.name, f_bytes=f_bytes, mime_type=mime_type diff --git a/recipes/DocExtract.py b/recipes/DocExtract.py index 23cf89bfe..0fa063379 100644 --- a/recipes/DocExtract.py +++ b/recipes/DocExtract.py @@ -475,7 +475,9 @@ def process_source( elif is_video: f = furl(webpage_url) if is_gdrive_url(f): - f_bytes, _ = gdrive_download(f, doc_meta.mime_type) + f_bytes, _ = gdrive_download( + f, doc_meta.mime_type, doc_meta.export_links + ) webpage_url = upload_file_from_bytes( doc_meta.name, f_bytes, content_type=doc_meta.mime_type )