Skip to content

Commit

Permalink
[feat]: add authorization identities to GoogleDriveLoader.
Browse files Browse the repository at this point in the history
Signed-off-by: Rahul Tripathi <[email protected]>
  • Loading branch information
Rahul Tripathi committed Apr 2, 2024
1 parent a13f396 commit 52f684a
Showing 1 changed file with 58 additions and 10 deletions.
68 changes: 58 additions & 10 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,36 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""The file loader class to use."""
file_loader_kwargs: Dict["str", Any] = {}
"""The file loader kwargs to use."""
load_auth: bool = False
"""Whether to load authorization identities."""

def _get_identity_metadata_from_id(self, id: str) -> List[str]:
    """Fetch the e-mail addresses of the identities that can access a file.

    Args:
        id: The Google Drive file ID whose permissions are listed.

    Returns:
        The e-mail addresses found on the file's permissions, in the order
        the API returns them. Permissions that carry no e-mail address are
        skipped.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        from googleapiclient.discovery import build
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load authorization identities."
        ) from exc

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)  # Build the service

    authorized_identities: List[str] = []
    permissions_api = service.permissions()
    # Ask for the e-mail address directly in the list call so that no extra
    # `permissions().get` round trip is needed for every permission.
    request = permissions_api.list(
        fileId=id,
        fields="nextPageToken, permissions(id, emailAddress)",
    )
    # Follow `nextPageToken` so identities beyond the first page are not
    # silently dropped; `list_next` returns None once there are no more pages.
    while request is not None:
        response = request.execute()
        for perm in response.get("permissions", []):
            email_id = perm.get("emailAddress")
            if email_id:
                authorized_identities.append(email_id)
        request = permissions_api.list_next(request, response)

    return authorized_identities

@root_validator
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
Expand Down Expand Up @@ -157,6 +187,8 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
sheets_service = build("sheets", "v4", credentials=creds)
spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
sheets = spreadsheet.get("sheets", [])
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)

documents = []
for sheet in sheets:
Expand All @@ -181,6 +213,8 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
"title": f"{spreadsheet['properties']['title']} - {sheet_name}",
"row": i,
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
content = []
for j, v in enumerate(row):
title = header[j].strip() if len(header) > j else ""
Expand All @@ -201,6 +235,8 @@ def _load_document_from_id(self, id: str) -> Document:

creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)

file = (
service.files()
Expand All @@ -227,6 +263,8 @@ def _load_document_from_id(self, id: str) -> Document:
"title": f"{file.get('name')}",
"when": f"{file.get('modifiedTime')}",
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
return Document(page_content=text, metadata=metadata)

def _load_documents_from_folder(
Expand Down Expand Up @@ -304,6 +342,9 @@ def _load_file_from_id(self, id: str) -> List[Document]:
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)

if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)

file = service.files().get(fileId=id, supportsAllDrives=True).execute()
request = service.files().get_media(fileId=id)
fh = BytesIO()
Expand All @@ -320,6 +361,8 @@ def _load_file_from_id(self, id: str) -> List[Document]:
doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
if "title" not in doc.metadata:
doc.metadata["title"] = f"{file.get('name')}"
if self.load_auth:
doc.metadata["authorized_identities"] = authorized_identities
return docs

else:
Expand All @@ -328,17 +371,22 @@ def _load_file_from_id(self, id: str) -> List[Document]:
content = fh.getvalue()
pdf_reader = PdfReader(BytesIO(content))

return [
Document(
page_content=page.extract_text(),
metadata={
"source": f"https://drive.google.com/file/d/{id}/view",
"title": f"{file.get('name')}",
"page": i,
},
docs = []
for i, page in enumerate(pdf_reader.pages):
metadata = {
"source": f"https://drive.google.com/file/d/{id}/view",
"title": f"{file.get('name')}",
"page": i,
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
docs.append(
Document(
page_content=page.extract_text(),
metadata=metadata,
)
)
for i, page in enumerate(pdf_reader.pages)
]
return docs

def _load_file_from_ids(self) -> List[Document]:
"""Load files from a list of IDs."""
Expand Down

0 comments on commit 52f684a

Please sign in to comment.