Skip to content

Commit

Permalink
Merge pull request #572 from GooeyAI/pptx-gdocs
Browse files: browse the repository at this point in the history
support all non exportable google workspace docs
  • Loading branch information
milovate authored Dec 23, 2024
2 parents f2662b7 + 376e5d1 commit 6e72eed
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 39 deletions.
47 changes: 15 additions & 32 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
import io

import typing
from furl import furl
import requests

from daras_ai_v2.exceptions import UserError
from daras_ai_v2.functional import flatmap_parallel
from daras_ai_v2.exceptions import raise_for_status

# Maps the MIME type of a native Google Workspace file to the MIME type it
# should be exported as. Types that are not listed here have no entry, so
# callers decide what to fall back to.
DOCS_EXPORT_MIMETYPES = {
    # Google Docs -> plain text
    "application/vnd.google-apps.document": "text/plain",
    # Google Sheets -> CSV
    "application/vnd.google-apps.spreadsheet": "text/csv",
    # Google Slides -> PowerPoint (.pptx)
    "application/vnd.google-apps.presentation": (
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ),
    # Google Drawings -> PDF
    "application/vnd.google-apps.drawing": "application/pdf",
}


def is_gdrive_url(f: furl) -> bool:
    """Return True when the URL's host is drive.google.com or docs.google.com.

    Only ``f.host`` is inspected; the path and query are ignored.
    """
    google_hosts = ["drive.google.com", "docs.google.com"]
    host = f.host
    return host in google_hosts
Expand Down Expand Up @@ -62,18 +69,23 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
return filter(None, urls)


def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
def gdrive_download(
f: furl, mime_type: str, export_links: typing.Optional[dict] = None
) -> tuple[bytes, str]:
from googleapiclient import discovery
from googleapiclient.http import MediaIoBaseDownload

if export_links is None:
export_links = {}

# get drive file id
file_id = url_to_gdrive_file_id(f)
# get metadata
service = discovery.build("drive", "v3")

if f.host != "drive.google.com":
# export google docs to appropriate type
export_mime_type, _ = docs_export_mimetype(f)
export_mime_type = DOCS_EXPORT_MIMETYPES.get(mime_type, mime_type)
if f_url_export := export_links.get(export_mime_type, None):
r = requests.get(f_url_export)
file_bytes = r.content
Expand All @@ -96,35 +108,6 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes,
return file_bytes, mime_type


def docs_export_mimetype(f: furl) -> tuple[str, str]:
    """
    Choose the export format for a Google Docs editor link.

    The format is picked from a keyword found among the URL's path segments.
    Export formats reference:
    https://developers.google.com/drive/api/guides/ref-export-formats

    Args:
        f (furl): google docs link
    Returns:
        tuple[str, str]: (mime_type, extension)
    Raises:
        ValueError: if none of the known keywords is present in the path segments
    """
    # Checked top to bottom; the first keyword present in the path wins.
    export_formats = (
        ("document", ("text/plain", ".txt")),
        ("spreadsheets", ("text/csv", ".csv")),
        (
            "presentation",
            (
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                ".pptx",
            ),
        ),
        ("drawings", ("application/pdf", ".pdf")),
    )

    segments = f.path.segments
    for keyword, export_format in export_formats:
        if keyword in segments:
            return export_format

    raise ValueError(f"Not sure how to export google docs url: {str(f)!r}")


def gdrive_metadata(file_id: str) -> dict:
from googleapiclient import discovery

Expand Down
6 changes: 3 additions & 3 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = meta.get("md5Checksum") or meta.get("modifiedTime")
mime_type = meta["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = meta.get("exportLinks", {})
export_links = meta.get("exportLinks", None)
else:
try:
if is_user_uploaded_url(f_url):
Expand All @@ -328,7 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
mime_type = None
etag = None
total_bytes = 0
export_links = {}
export_links = None
else:
name = (
r.headers.get("content-disposition", "")
Expand All @@ -340,7 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = etag.strip('"')
mime_type = get_mimetype_from_response(r)
total_bytes = int(r.headers.get("content-length") or 0)
export_links = {}
export_links = None
# extract filename from url as a fallback
if not name:
if is_user_uploaded_url(f_url):
Expand Down
5 changes: 1 addition & 4 deletions files/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ class FileMetadata(models.Model):
etag = models.CharField(max_length=255, null=True)
mime_type = models.CharField(max_length=255, default="", blank=True)
total_bytes = models.PositiveIntegerField(default=0, blank=True)

def __init__(self, *args, **kwargs):
    """Initialise the model, then give the instance an empty ``export_links`` dict.

    All positional and keyword arguments are forwarded unchanged to the
    parent initialiser.
    """
    super().__init__(*args, **kwargs)
    # Plain instance attribute, assigned fresh for every instance so the
    # dict is never shared between objects.
    self.export_links = dict()
export_links: dict[str, str] | None = None

def __str__(self):
ret = f"{self.name or 'Unnamed'} - {self.mime_type}"
Expand Down

0 comments on commit 6e72eed

Please sign in to comment.