From dcea38bd0ced574bee4c73ddd971094f8453ad1d Mon Sep 17 00:00:00 2001 From: milovate Date: Fri, 20 Dec 2024 18:56:33 +0530 Subject: [PATCH 1/5] fix: export - fallbacks to get_media --- daras_ai_v2/gdrive_downloader.py | 58 ++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index 3f18f7e1d..7dd20a262 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -1,5 +1,5 @@ import io - +import mimetypes from furl import furl import requests @@ -73,8 +73,13 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, if f.host != "drive.google.com": # export google docs to appropriate type - export_mime_type, _ = docs_export_mimetype(f) - if f_url_export := export_links.get(export_mime_type, None): + export_mime_type, _, is_google_workspace_doc = docs_export_mimetype( + f, mime_type + ) + + if is_google_workspace_doc and ( + f_url_export := export_links.get(export_mime_type, None) + ): r = requests.get(f_url_export) file_bytes = r.content raise_for_status(r, is_user_url=True) @@ -96,7 +101,7 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, return file_bytes, mime_type -def docs_export_mimetype(f: furl) -> tuple[str, str]: +def docs_export_mimetype(f: furl, mime_type) -> tuple[str, str, bool]: """ return the mimetype to export google docs - https://developers.google.com/drive/api/guides/ref-export-formats @@ -104,25 +109,36 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]: f (furl): google docs link Returns: - tuple[str, str]: (mime_type, extension) + tuple[str, str]: (mime_type, extension, is_google_workspace_supported) """ - if "document" in f.path.segments: - mime_type = "text/plain" - ext = ".txt" - elif "spreadsheets" in f.path.segments: - mime_type = "text/csv" - ext = ".csv" - elif "presentation" in f.path.segments: - mime_type = ( - "application/vnd.openxmlformats-officedocument.presentationml.presentation" - ) - ext = ".pptx" - elif "drawings" in f.path.segments: - mime_type = "application/pdf" - ext = ".pdf" + + supported_mimetypes = { + "application/vnd.google-apps.spreadsheet", + "application/vnd.google-apps.presentation", + "application/vnd.google-apps.drawing", + "application/vnd.google-apps.document", + } + + is_google_workspace_supported = mime_type in supported_mimetypes + + if is_google_workspace_supported: + if "document" in f.path.segments: + mime_type = "text/plain" + ext = ".txt" + elif "spreadsheets" in f.path.segments: + mime_type = "text/csv" + ext = ".csv" + elif "presentation" in f.path.segments: + mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ext = ".pptx" + elif "drawings" in f.path.segments: + mime_type = "application/pdf" + ext = ".pdf" + else: + raise ValueError(f"Not sure how to export google docs url: {str(f)!r}") else: - raise ValueError(f"Not sure how to export google docs url: {str(f)!r}") - return mime_type, ext + ext = f".{mimetypes.guess_extension(mime_type)}" or "" + return mime_type, ext, is_google_workspace_supported def gdrive_metadata(file_id: str) -> dict: From 6b041e5c2aa46e239d39c2ba829e99896aa65c4f Mon Sep 17 00:00:00 2001 From: milovate Date: Fri, 20 Dec 2024 20:27:39 +0530 Subject: [PATCH 2/5] refactor: simplify gdrive_download --- daras_ai_v2/gdrive_downloader.py | 58 ++++++-------------------------- 1 file changed, 10 insertions(+), 48 deletions(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index 7dd20a262..6544008ae 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -1,5 +1,5 @@ import io -import mimetypes + from furl import furl import requests @@ -71,15 +71,17 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, # get metadata service = discovery.build("drive", "v3") + docs_export_mimetype = { + "application/vnd.google-apps.document": "text/plain", + "application/vnd.google-apps.spreadsheet": "text/csv", + "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.google-apps.drawing": "application/pdf", + } + if f.host != "drive.google.com": # export google docs to appropriate type - export_mime_type, _, is_google_workspace_doc = docs_export_mimetype( - f, mime_type - ) - - if is_google_workspace_doc and ( - f_url_export := export_links.get(export_mime_type, None) - ): + export_mime_type = docs_export_mimetype.get(mime_type, mime_type) + if f_url_export := export_links.get(export_mime_type, None): r = requests.get(f_url_export) file_bytes = r.content raise_for_status(r, is_user_url=True) @@ -101,46 +103,6 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, return file_bytes, mime_type -def docs_export_mimetype(f: furl, mime_type) -> tuple[str, str, bool]: - """ - return the mimetype to export google docs - https://developers.google.com/drive/api/guides/ref-export-formats - - Args: - f (furl): google docs link - - Returns: - tuple[str, str]: (mime_type, extension, is_google_workspace_supported) - """ - - supported_mimetypes = { - "application/vnd.google-apps.spreadsheet", - "application/vnd.google-apps.presentation", - "application/vnd.google-apps.drawing", - "application/vnd.google-apps.document", - } - - is_google_workspace_supported = mime_type in supported_mimetypes - - if is_google_workspace_supported: - if "document" in f.path.segments: - mime_type = "text/plain" - ext = ".txt" - elif "spreadsheets" in f.path.segments: - mime_type = "text/csv" - ext = ".csv" - elif "presentation" in f.path.segments: - mime_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation" - ext = ".pptx" - elif "drawings" in f.path.segments: - mime_type = "application/pdf" - ext = ".pdf" - else: - raise ValueError(f"Not sure how to export google docs url: {str(f)!r}") - else: - ext = f".{mimetypes.guess_extension(mime_type)}" or "" - return mime_type, ext, is_google_workspace_supported - - def gdrive_metadata(file_id: str) -> dict: from googleapiclient import discovery From 717246563b726ea34c13fc3b844e4ea1b8888ce5 Mon Sep 17 00:00:00 2001 From: milovate Date: Sat, 21 Dec 2024 21:40:08 +0530 Subject: [PATCH 3/5] fix: add default value for export_links in gdrive_download --- daras_ai_v2/gdrive_downloader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index 6544008ae..d7b6aba1c 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -62,7 +62,9 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str return filter(None, urls) -def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]: +def gdrive_download( + f: furl, mime_type: str, export_links: dict = {} +) -> tuple[bytes, str]: from googleapiclient import discovery from googleapiclient.http import MediaIoBaseDownload From b91b32569f9ac8f1abfe00fdccfbfccd232dffb1 Mon Sep 17 00:00:00 2001 From: milovate Date: Mon, 23 Dec 2024 15:14:14 +0530 Subject: [PATCH 4/5] refactor: docs_export_mimetype as top level constant --- daras_ai_v2/gdrive_downloader.py | 21 ++++++++++++--------- daras_ai_v2/vector_search.py | 6 +++--- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index d7b6aba1c..19e68fa2a 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -1,5 +1,5 @@ import io - +import typing from furl import furl import requests @@ -7,6 +7,13 @@ from daras_ai_v2.functional import flatmap_parallel from daras_ai_v2.exceptions import raise_for_status +docs_export_mimetype = { + "application/vnd.google-apps.document": "text/plain", + "application/vnd.google-apps.spreadsheet": "text/csv", + "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.google-apps.drawing": "application/pdf", +} + def is_gdrive_url(f: furl) -> bool: return f.host in ["drive.google.com", "docs.google.com"] @@ -63,23 +70,19 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str def gdrive_download( - f: furl, mime_type: str, export_links: dict = {} + f: furl, mime_type: str, export_links: typing.Optional[dict] = None ) -> tuple[bytes, str]: from googleapiclient import discovery from googleapiclient.http import MediaIoBaseDownload + if export_links is None: + export_links = {} + # get drive file id file_id = url_to_gdrive_file_id(f) # get metadata service = discovery.build("drive", "v3") - docs_export_mimetype = { - "application/vnd.google-apps.document": "text/plain", - "application/vnd.google-apps.spreadsheet": "text/csv", - "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.google-apps.drawing": "application/pdf", - } - if f.host != "drive.google.com": # export google docs to appropriate type export_mime_type = docs_export_mimetype.get(mime_type, mime_type) diff --git a/daras_ai_v2/vector_search.py b/daras_ai_v2/vector_search.py index 3d54d5383..302e5cda9 100644 --- a/daras_ai_v2/vector_search.py +++ b/daras_ai_v2/vector_search.py @@ -310,7 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata: etag = meta.get("md5Checksum") or meta.get("modifiedTime") mime_type = meta["mimeType"] total_bytes = int(meta.get("size") or 0) - export_links = meta.get("exportLinks", {}) + export_links = meta.get("exportLinks", None) else: try: if is_user_uploaded_url(f_url): @@ -328,7 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata: mime_type = None etag = None total_bytes = 0 - export_links = {} + export_links = None else: name = ( r.headers.get("content-disposition", "") @@ -340,7 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata: etag = etag.strip('"') mime_type = get_mimetype_from_response(r) total_bytes = int(r.headers.get("content-length") or 0) - export_links = {} + export_links = None # extract filename from url as a fallback if not name: if is_user_uploaded_url(f_url): From 376e5d10be665955d7d9b0812cdcc95ad0442f89 Mon Sep 17 00:00:00 2001 From: milovate Date: Mon, 23 Dec 2024 16:16:39 +0530 Subject: [PATCH 5/5] fix: FileMetadata export_links type definition --- daras_ai_v2/gdrive_downloader.py | 4 ++-- files/models.py | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/daras_ai_v2/gdrive_downloader.py b/daras_ai_v2/gdrive_downloader.py index 19e68fa2a..ce1e23620 100644 --- a/daras_ai_v2/gdrive_downloader.py +++ b/daras_ai_v2/gdrive_downloader.py @@ -7,7 +7,7 @@ from daras_ai_v2.functional import flatmap_parallel from daras_ai_v2.exceptions import raise_for_status -docs_export_mimetype = { +DOCS_EXPORT_MIMETYPES = { "application/vnd.google-apps.document": "text/plain", "application/vnd.google-apps.spreadsheet": "text/csv", "application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation", @@ -85,7 +85,7 @@ def gdrive_download( if f.host != "drive.google.com": # export google docs to appropriate type - export_mime_type = docs_export_mimetype.get(mime_type, mime_type) + export_mime_type = DOCS_EXPORT_MIMETYPES.get(mime_type, mime_type) if f_url_export := export_links.get(export_mime_type, None): r = requests.get(f_url_export) file_bytes = r.content diff --git a/files/models.py b/files/models.py index afb6504cc..b03a598b3 100644 --- a/files/models.py +++ b/files/models.py @@ -7,10 +7,7 @@ class FileMetadata(models.Model): etag = models.CharField(max_length=255, null=True) mime_type = models.CharField(max_length=255, default="", blank=True) total_bytes = models.PositiveIntegerField(default=0, blank=True) - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.export_links = {} + export_links: dict[str, str] | None = None def __str__(self): ret = f"{self.name or 'Unnamed'} - {self.mime_type}"