Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ninalopatina add file type support #2

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 24 additions & 18 deletions backend/danswer/connectors/google_drive/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import read_any_file
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger

from danswer.file_processing.extract_file_text import read_any_file

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to reorder this import?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, but it's fine wherever.


logger = setup_logger()

Expand All @@ -65,6 +65,25 @@ class GDriveMimeType(str, Enum):
POWERPOINT = (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
BMP = "image/bmp"
CSV = "text/csv"
EML = "message/rfc822"
EPUB = "application/epub+zip"
HEIC = "image/heic"
HTML = "text/html"
JPEG = "image/jpeg"
PNG = "image/png"
MD = "text/markdown"
MSG = "application/vnd.ms-outlook"
ODT = "application/vnd.oasis.opendocument.text"
ORG = "text/x-org"
P7S = "application/pkcs7-signature"
RST = "text/x-rst"
RTF = "application/rtf"
TIFF = "image/tiff"
TXT = "text/plain"
TSV = "text/tab-separated-values"
XML = "application/xml"


GoogleDriveFileType = dict[str, Any]
Expand Down Expand Up @@ -309,33 +328,20 @@ def get_all_files_batched(

def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
mime_type = file["mimeType"]

if mime_type not in set(item.value for item in GDriveMimeType):
# Unsupported file types can still have a title, so finding them this way is still useful
return UNSUPPORTED_FILE_TYPE_CONTENT

if mime_type == GDriveMimeType.DOC.value:
response = service.files().export(fileId=file["id"], mimeType="text/plain").execute()
return read_any_file(file=io.BytesIO(response), file_name=file['name'])
elif mime_type == GDriveMimeType.SPREADSHEET.value:
response = service.files().export(fileId=file["id"], mimeType="text/csv").execute()
return read_any_file(file=io.BytesIO(response), file_name=file['name'])
elif mime_type == GDriveMimeType.EXCEL.value:
response = service.files().get_media(fileId=file["id"]).execute()
return read_any_file(file=io.BytesIO(response), file_name=file['name'])
elif mime_type == GDriveMimeType.WORD_DOC.value:
response = service.files().get_media(fileId=file["id"]).execute()
return read_any_file(file=io.BytesIO(response), file_name = file['name']) # DOCX
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
return read_any_file(file=io.BytesIO(response), file_name = file['name']) # PDF
elif mime_type == GDriveMimeType.POWERPOINT.value:
response = service.files().get_media(fileId=file["id"]).execute()
return read_any_file(file=io.BytesIO(response), file_name = file['name']) #PPT
elif mime_type == GDriveMimeType.PPT.value:
response = service.files().export(fileId=file["id"], mimeType="text/plain").execute()
else:
response = service.files().get_media(fileId=file["id"]).execute()
return read_any_file(file=io.BytesIO(response), file_name = file['name']) # PPT

return UNSUPPORTED_FILE_TYPE_CONTENT
return read_any_file(file=io.BytesIO(response), file_name=file['name'])


class GoogleDriveConnector(LoadConnector, PollConnector):
Expand Down
15 changes: 14 additions & 1 deletion backend/danswer/file_processing/extract_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

logger = setup_logger()


TEXT_SECTION_SEPARATOR = "\n\n"


Expand All @@ -43,6 +42,8 @@
".xml",
".yml",
".yaml",
".rst",
".org",
]


Expand All @@ -54,6 +55,18 @@
".eml",
".epub",
".html",
".bmp",
".doc",
".heic",
".jpeg",
".png",
".msg",
".odt",
".p7s",
".ppt",
".rtf",
".tiff",
".xls",
]

def _sdk_partition_request(file: IO[Any], file_name: str, **kwargs) -> operations.PartitionRequest:
Expand Down