Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: pptx uploaded on google docs #540

Merged
merged 7 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 25 additions & 19 deletions daras_ai_v2/gdrive_downloader.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import io

from furl import furl
import requests

from daras_ai_v2.exceptions import UserError
from daras_ai_v2.functional import flatmap_parallel
from daras_ai_v2.exceptions import raise_for_status


def is_gdrive_url(f: furl) -> bool:
Expand Down Expand Up @@ -60,36 +62,38 @@ def gdrive_list_urls_of_files_in_folder(f: furl, max_depth: int = 4) -> list[str
return filter(None, urls)


def gdrive_download(f: furl, mime_type: str, export_links: dict) -> tuple[bytes, str]:
    """Download the contents of a Google Drive file.

    For plain Drive files (host ``drive.google.com``) the raw media is
    downloaded. For Google Docs editors files (docs/sheets/slides/... hosts)
    the ready-made export link matching the preferred export mime type is
    used when available; otherwise we fall back to a direct media download.

    Args:
        f: parsed Google Drive / Google Docs URL.
        mime_type: mime type from the file's metadata; returned unchanged
            for direct media downloads.
        export_links: mapping of export mime type -> export URL, taken from
            the file metadata's ``exportLinks`` field (may be empty).

    Returns:
        Tuple of ``(file_bytes, effective_mime_type)``.
    """
    from googleapiclient import discovery
    from googleapiclient.http import MediaIoBaseDownload

    # get drive file id
    file_id = url_to_gdrive_file_id(f)
    service = discovery.build("drive", "v3")

    if f.host != "drive.google.com":
        # export google docs to an appropriate downloadable type
        export_mime_type, _ = docs_export_mimetype(f)
        if f_url_export := export_links.get(export_mime_type):
            r = requests.get(f_url_export)
            # check the response status before touching the body
            raise_for_status(r, is_user_url=True)
            return r.content, export_mime_type
        # no export link for this mime type — fall through to a media download

    request = service.files().get_media(
        fileId=file_id,
        supportsAllDrives=True,
    )
    # stream the download chunk by chunk into an in-memory buffer
    file = io.BytesIO()
    downloader = MediaIoBaseDownload(file, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    return file.getvalue(), mime_type


def docs_export_mimetype(f: furl) -> tuple[str, str]:
Expand All @@ -109,8 +113,10 @@ def docs_export_mimetype(f: furl) -> tuple[str, str]:
mime_type = "text/csv"
ext = ".csv"
elif "presentation" in f.path.segments:
mime_type = "application/pdf"
ext = ".pdf"
mime_type = (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
ext = ".pptx"
elif "drawings" in f.path.segments:
mime_type = "application/pdf"
ext = ".pdf"
Expand All @@ -128,7 +134,7 @@ def gdrive_metadata(file_id: str) -> dict:
.get(
supportsAllDrives=True,
fileId=file_id,
fields="name,md5Checksum,modifiedTime,mimeType,size",
fields="name,md5Checksum,modifiedTime,mimeType,size,exportLinks",
)
.execute()
)
Expand Down
2 changes: 1 addition & 1 deletion daras_ai_v2/glossary.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def validate_glossary_document(document: str):

metadata = doc_url_to_file_metadata(document)
f_bytes, mime_type = download_content_bytes(
f_url=document, mime_type=metadata.mime_type
f_url=document, mime_type=metadata.mime_type, export_links=metadata.export_links
)
df = tabular_bytes_to_str_df(
f_name=metadata.name, f_bytes=f_bytes, mime_type=mime_type
Expand Down
125 changes: 57 additions & 68 deletions daras_ai_v2/office_utils_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ def pptx_to_text_pages(f: typing.BinaryIO, use_form_reco: bool = False) -> list[
except Exception as e:
slide_content.append(f" Error processing shape: {e}")

if slide.has_notes_slide:
slide_content.extend(handle_author_notes(slide))

slides_text.append("\n".join(slide_content) + "\n")

return slides_text


Expand All @@ -43,81 +47,55 @@ def handle_text_elements(shape) -> list[str]:
Handles text elements within a shape, including lists.
"""
text_elements = []
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0
bullet_type = "None"
list_label = "LIST"
namespaces = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}

# Identify if shape contains lists
current_list_type = None
list_item_index = 0

for paragraph in shape.text_frame.paragraphs:
p = paragraph._element
paragraph_text = ""
is_list_item = False

# Determine list type
if p.find(".//a:buChar", namespaces=namespaces) is not None:
bullet_type = "Bullet"
is_a_list = True
current_list_type = "Bullet"
is_list_item = True
elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
bullet_type = "Numbered"
is_a_list = True
current_list_type = "Numbered"
is_list_item = True
elif paragraph.level > 0: # Indented text is also treated as a list
current_list_type = "Bullet"
is_list_item = True
else:
is_a_list = False

if paragraph.level > 0:
is_a_list = True

if is_a_list:
if bullet_type == "Numbered":
list_label = "ORDERED_LIST"

# Iterate through paragraphs to build up text
for paragraph in shape.text_frame.paragraphs:
p = paragraph._element
enum_list_item_value += 1
inline_paragraph_text = ""
inline_list_item_text = ""
doc_label = "PARAGRAPH"

for e in p.iterfind(".//a:r", namespaces=namespaces):
if len(e.text.strip()) > 0:
e_is_a_list_item = False
is_numbered = False
if p.find(".//a:buChar", namespaces=namespaces) is not None:
bullet_type = "Bullet"
e_is_a_list_item = True
elif p.find(".//a:buAutoNum", namespaces=namespaces) is not None:
bullet_type = "Numbered"
is_numbered = True
e_is_a_list_item = True
else:
e_is_a_list_item = False

if e_is_a_list_item:
if len(inline_paragraph_text) > 0:
text_elements.append(inline_paragraph_text)
inline_list_item_text += e.text
current_list_type = None
list_item_index = 0 # Reset numbering if no list

# Process paragraph text
for run in p.iterfind(".//a:r", namespaces=namespaces):
run_text = run.text.strip() if run.text else ""
if run_text:
paragraph_text += run_text

if is_list_item:
if current_list_type == "Numbered":
list_item_index += 1
list_prefix = f"{list_item_index}."
else:
list_prefix = "•" # Default bullet symbol
text_elements.append(f"{list_prefix} {paragraph_text}")
else:
# Handle placeholders for titles or subtitles
if shape.is_placeholder:
placeholder_type = shape.placeholder_format.type
if placeholder_type == PP_PLACEHOLDER.TITLE:
text_elements.append(f"TITLE: {paragraph_text}")
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
text_elements.append(f"SECTION_HEADER: {paragraph_text}")
else:
if shape.is_placeholder:
placeholder_type = shape.placeholder_format.type
if placeholder_type in [
PP_PLACEHOLDER.CENTER_TITLE,
PP_PLACEHOLDER.TITLE,
]:
doc_label = "TITLE"
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
doc_label = "SECTION_HEADER"
enum_list_item_value = 0
inline_paragraph_text += e.text

if len(inline_paragraph_text) > 0:
text_elements.append(inline_paragraph_text)

if len(inline_list_item_text) > 0:
enum_marker = ""
if is_numbered:
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
is_list_group_created = True
text_elements.append(f"{enum_marker} {inline_list_item_text}")
text_elements.append(paragraph_text)
else:
text_elements.append(paragraph_text)

return text_elements

Expand Down Expand Up @@ -171,7 +149,7 @@ def handle_tables(shape) -> list[str]:
for row in grid[1:]:
line = "|" + "|".join(row) + "|"
table_text.append(line)
print(line)
# print(line)

return table_text

Expand Down Expand Up @@ -207,6 +185,17 @@ def handle_charts(shape) -> list[str]:
return chart_text


def handle_author_notes(slide) -> list[str]:
    """Extract speaker notes from a slide.

    The caller must ensure ``slide.has_notes_slide`` is true before calling,
    since ``slide.notes_slide`` is accessed unconditionally here.

    Returns:
        ``["Speaker Notes:", <stripped notes text>]`` when non-empty notes
        exist, otherwise an empty list.
    """
    notes = []
    if slide.notes_slide.notes_text_frame:
        notes_text = slide.notes_slide.notes_text_frame.text.strip()
        if notes_text:
            notes.append("Speaker Notes:")
            notes.append(notes_text)
    return notes


# TODO :azure form reco to extract text from images
def handle_pictures(shape):
pass
21 changes: 17 additions & 4 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = meta.get("md5Checksum") or meta.get("modifiedTime")
mime_type = meta["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = meta.get("exportLinks", {})
else:
try:
if is_user_uploaded_url(f_url):
Expand All @@ -327,6 +328,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
mime_type = None
etag = None
total_bytes = 0
export_links = {}
else:
name = (
r.headers.get("content-disposition", "")
Expand All @@ -338,6 +340,7 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
etag = etag.strip('"')
mime_type = get_mimetype_from_response(r)
total_bytes = int(r.headers.get("content-length") or 0)
export_links = {}
# extract filename from url as a fallback
if not name:
if is_user_uploaded_url(f_url):
Expand All @@ -347,9 +350,12 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
# guess mimetype from name as a fallback
if not mime_type:
mime_type = mimetypes.guess_type(name)[0]
return FileMetadata(

file_metadata = FileMetadata(
name=name, etag=etag, mime_type=mime_type or "", total_bytes=total_bytes
)
file_metadata.export_links = export_links or {}
return file_metadata


def yt_dlp_get_video_entries(url: str) -> list[dict]:
Expand Down Expand Up @@ -650,7 +656,10 @@ def doc_url_to_text_pages(
Download document from url and convert to text pages.
"""
f_bytes, mime_type = download_content_bytes(
f_url=f_url, mime_type=file_meta.mime_type, is_user_url=is_user_url
f_url=f_url,
mime_type=file_meta.mime_type,
is_user_url=is_user_url,
export_links=file_meta.export_links,
)
if not f_bytes:
return []
Expand All @@ -664,14 +673,18 @@ def doc_url_to_text_pages(


def download_content_bytes(
*, f_url: str, mime_type: str, is_user_url: bool = True
*,
f_url: str,
mime_type: str,
is_user_url: bool = True,
export_links: dict[str, str] = {},
) -> tuple[bytes, str]:
if is_yt_dlp_able_url(f_url):
return download_youtube_to_wav(f_url), "audio/wav"
f = furl(f_url)
if is_gdrive_url(f):
# download from google drive
return gdrive_download(f, mime_type)
return gdrive_download(f, mime_type, export_links)
try:
# download from url
if is_user_uploaded_url(f_url):
Expand Down
4 changes: 4 additions & 0 deletions files/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ class FileMetadata(models.Model):
mime_type = models.CharField(max_length=255, default="", blank=True)
total_bytes = models.PositiveIntegerField(default=0, blank=True)

def __init__(self, *args, **kwargs):
    """Initialize the model instance and attach a transient export-links map."""
    super().__init__(*args, **kwargs)
    # Plain instance attribute, not a model field (unlike name/mime_type
    # above), so it is never persisted to the database. Holds Google Drive
    # export-mime-type -> URL entries — TODO confirm that transience is intended.
    self.export_links = dict()

def __str__(self):
ret = f"{self.name or 'Unnamed'} - {self.mime_type}"
if self.total_bytes:
Expand Down
2 changes: 1 addition & 1 deletion recipes/BulkRunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ def get_columns(files: list[str]) -> list[str]:
def read_df_any(f_url: str) -> "pd.DataFrame":
file_meta = doc_url_to_file_metadata(f_url)
f_bytes, mime_type = download_content_bytes(
f_url=f_url, mime_type=file_meta.mime_type
f_url=f_url, mime_type=file_meta.mime_type, export_links=file_meta.export_links
)
df = tabular_bytes_to_any_df(
f_name=file_meta.name, f_bytes=f_bytes, mime_type=mime_type
Expand Down
4 changes: 3 additions & 1 deletion recipes/DocExtract.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,7 +475,9 @@ def process_source(
elif is_video:
f = furl(webpage_url)
if is_gdrive_url(f):
f_bytes, _ = gdrive_download(f, doc_meta.mime_type)
f_bytes, _ = gdrive_download(
f, doc_meta.mime_type, doc_meta.export_links
)
webpage_url = upload_file_from_bytes(
doc_meta.name, f_bytes, content_type=doc_meta.mime_type
)
Expand Down
Loading