From b91b32569f9ac8f1abfe00fdccfbfccd232dffb1 Mon Sep 17 00:00:00 2001 From: milovate Date: Mon, 23 Dec 2024 15:14:14 +0530 Subject: [PATCH] refactor: docs_export_mimetype as top level constant --- daras_ai_v2/gdrive_downloader.py | 21 ++++++++++++--------- daras_ai_v2/vector_search.py | 6 +++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index d7b6aba1c..19e68fa2a 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -1,5 +1,5 @@ import io - +import typing from furl import furl import requests @@ -7,6 +7,13 @@ from daras_ai_v2.functional import flatmap_parallel from daras_ai_v2.exceptions import raise_for_status +docs_export_mimetype = { + "application/vnd.google-apps.document": "text/plain", + "application/vnd.google-apps.spreadsheet": "text/csv", + "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.google-apps.drawing": "application/pdf", +} + def is_gdrive_url(f: furl) -> bool: return f.host in ["drive.google.com", "docs.google.com"] @@ -63,23 +70,19 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str def gdrive_download( - f: furl, mime_type: str, export_links: dict = {} + f: furl, mime_type: str, export_links: typing.Optional[dict] = None ) -> tuple[bytes, str]: from googleapiclient import discovery from googleapiclient.http import MediaIoBaseDownload + if export_links is None: + export_links = {} + # get drive file id file_id = url_to_gdrive_file_id(f) # get metadata service = discovery.build("drive", "v3") - docs_export_mimetype = { - "application/vnd.google-apps.document": "text/plain", - "application/vnd.google-apps.spreadsheet": "text/csv", - "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.google-apps.drawing": "application/pdf", - } - if f.host != "drive.google.com": # export google docs to appropriate type export_mime_type = docs_export_mimetype.get(mime_type, mime_type) diff --git a/daras_ai_v2/vector_search.py b/daras_ai_v2/vector_search.py index 3d54d5383..302e5cda9 100644 --- a/daras_ai_v2/vector_search.py +++ b/daras_ai_v2/vector_search.py @@ -310,7 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata: etag = meta.get("md5Checksum") or meta.get("modifiedTime") mime_type = meta["mimeType"] total_bytes = int(meta.get("size") or 0) - export_links = meta.get("exportLinks", {}) + export_links = meta.get("exportLinks", None) else: try: if is_user_uploaded_url(f_url): @@ -328,7 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata: mime_type = None etag = None total_bytes = 0 - export_links = {} + export_links = None else: name = ( r.headers.get("content-disposition", "") @@ -340,7 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata: etag = etag.strip('"') mime_type = get_mimetype_from_response(r) total_bytes = int(r.headers.get("content-length") or 0) - export_links = {} + export_links = None # extract filename from url as a fallback if not name: if is_user_uploaded_url(f_url):