Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support all non-exportable Google Workspace docs #572

Merged
merged 5 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 15 additions & 32 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
import io

import typing
from furl import furl
import requests

from daras_ai_v2.exceptions import UserError
from daras_ai_v2.functional import flatmap_parallel
from daras_ai_v2.exceptions import raise_for_status

docs_export_mimetype = {
devxpy marked this conversation as resolved.
Show resolved Hide resolved
"application/vnd.google-apps.document": "text/plain",
"application/vnd.google-apps.spreadsheet": "text/csv",
"application/vnd.google-apps.presentation": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.google-apps.drawing": "application/pdf",
}


def is_gdrive_url(f: furl) -> bool:
return f.host in ["drive.google.com", "docs.google.com"]
Expand Down Expand Up @@ -62,18 +69,23 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
return filter(None, urls)


def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
def gdrive_download(
f: furl, mime_type: str, export_links: typing.Optional[dict] = None
) -> tuple[bytes, str]:
from googleapiclient import discovery
from googleapiclient.http import MediaIoBaseDownload

if export_links is None:
export_links = {}

# get drive file id
file_id = url_to_gdrive_file_id(f)
# get metadata
service = discovery.build("drive", "v3")

if f.host != "drive.google.com":
# export google docs to appropriate type
export_mime_type, _ = docs_export_mimetype(f)
export_mime_type = docs_export_mimetype.get(mime_type, mime_type)
if f_url_export := export_links.get(export_mime_type, None):
devxpy marked this conversation as resolved.
Show resolved Hide resolved
r = requests.get(f_url_export)
file_bytes = r.content
Expand All @@ -96,35 +108,6 @@ def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes,
return file_bytes, mime_type


def docs_export_mimetype(f: furl) -> tuple[str, str]:
"""
return the mimetype to export google docs - https://developers.google.com/drive/api/guides/ref-export-formats

Args:
f (furl): google docs link

Returns:
tuple[str, str]: (mime_type, extension)
"""
if "document" in f.path.segments:
mime_type = "text/plain"
ext = ".txt"
elif "spreadsheets" in f.path.segments:
mime_type = "text/csv"
ext = ".csv"
elif "presentation" in f.path.segments:
mime_type = (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
ext = ".pptx"
elif "drawings" in f.path.segments:
mime_type = "application/pdf"
ext = ".pdf"
else:
raise ValueError(f"Not sure how to export google docs url: {str(f)!r}")
return mime_type, ext


def gdrive_metadata(file_id: str) -> dict:
from googleapiclient import discovery

Expand Down
6 changes: 3 additions & 3 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = meta.get("md5Checksum") or meta.get("modifiedTime")
mime_type = meta["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = meta.get("exportLinks", {})
export_links = meta.get("exportLinks", None)
else:
try:
if is_user_uploaded_url(f_url):
Expand All @@ -328,7 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
mime_type = None
etag = None
total_bytes = 0
export_links = {}
export_links = None
else:
name = (
r.headers.get("content-disposition", "")
Expand All @@ -340,7 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = etag.strip('"')
mime_type = get_mimetype_from_response(r)
total_bytes = int(r.headers.get("content-length") or 0)
export_links = {}
export_links = None
# extract filename from url as a fallback
if not name:
if is_user_uploaded_url(f_url):
Expand Down
Loading