Skip to content

Commit

Permalink
[feat]: add authorization identities to GoogleDriveLoader. (#110)
Browse files Browse the repository at this point in the history
* [feat]: add authorization identities to GoogleDriveLoader.
---------

Signed-off-by: Rahul Tripathi <[email protected]>
Co-authored-by: Rahul Tripathi <[email protected]>
  • Loading branch information
rahul-trip and Rahul Tripathi authored Apr 5, 2024
1 parent ca3724a commit 7668e1f
Showing 1 changed file with 75 additions and 11 deletions.
86 changes: 75 additions & 11 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,52 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""The file loader class to use."""
file_loader_kwargs: Dict["str", Any] = {}
"""The file loader kwargs to use."""
load_auth: bool = False
"""Whether to load authorization identities."""

def _get_identity_metadata_from_id(self, id: str) -> List[str]:
    """Fetch the email addresses of the identities with access to a file.

    This is a best-effort lookup: if the permissions cannot be retrieved,
    a message is printed and an empty list is returned instead of raising.

    Args:
        id: ID of the Google Drive file whose permissions are listed.

    Returns:
        The ``emailAddress`` of every permission entry that has one. The
        list is empty when the permissions request fails.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load authorization identities."
        ) from exc

    authorized_identities: List[str] = []
    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)  # Build the service
    try:
        # Request the email address together with the listing so that no
        # additional (and previously unguarded) per-permission request is
        # needed afterwards.
        permissions = (
            service.permissions()
            .list(fileId=id, fields="permissions(id,emailAddress)")
            .execute()
        )
    except googleapiclient.errors.HttpError:
        # Adjacent string literals instead of a backslash continuation keep
        # the source indentation out of the printed message.
        print(
            "insufficientFilePermissions: The user does not have sufficient "
            "permissions to retrieve permission for the file with "
            f"fileId: {id}"
        )
        return authorized_identities
    except Exception as exc:
        print(
            "Error occurred while fetching the permissions for the file with "
            f"fileId: {id}"
        )
        print(f"Error: {exc}")
        return authorized_identities

    # Permission entries without an email address are skipped.
    for perm in permissions.get("permissions", []):
        email_id = perm.get("emailAddress")
        if email_id:
            authorized_identities.append(email_id)

    return authorized_identities

@root_validator
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
Expand Down Expand Up @@ -113,7 +159,7 @@ def _load_credentials(self) -> Any:
)
except ImportError:
raise ImportError(
"You must run "
"Install prerequisites by running: "
"`pip install --upgrade "
"google-api-python-client google-auth-httplib2 "
"google-auth-oauthlib` "
Expand Down Expand Up @@ -157,6 +203,8 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
sheets_service = build("sheets", "v4", credentials=creds)
spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
sheets = spreadsheet.get("sheets", [])
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)

documents = []
for sheet in sheets:
Expand All @@ -181,6 +229,8 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
"title": f"{spreadsheet['properties']['title']} - {sheet_name}",
"row": i,
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
content = []
for j, v in enumerate(row):
title = header[j].strip() if len(header) > j else ""
Expand All @@ -201,6 +251,8 @@ def _load_document_from_id(self, id: str) -> Document:

creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)

file = (
service.files()
Expand All @@ -227,6 +279,8 @@ def _load_document_from_id(self, id: str) -> Document:
"title": f"{file.get('name')}",
"when": f"{file.get('modifiedTime')}",
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities # type: ignore
return Document(page_content=text, metadata=metadata)

def _load_documents_from_folder(
Expand Down Expand Up @@ -304,6 +358,9 @@ def _load_file_from_id(self, id: str) -> List[Document]:
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)

if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)

file = service.files().get(fileId=id, supportsAllDrives=True).execute()
request = service.files().get_media(fileId=id)
fh = BytesIO()
Expand All @@ -320,6 +377,8 @@ def _load_file_from_id(self, id: str) -> List[Document]:
doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
if "title" not in doc.metadata:
doc.metadata["title"] = f"{file.get('name')}"
if self.load_auth:
doc.metadata["authorized_identities"] = authorized_identities
return docs

else:
Expand All @@ -328,17 +387,22 @@ def _load_file_from_id(self, id: str) -> List[Document]:
content = fh.getvalue()
pdf_reader = PdfReader(BytesIO(content))

return [
Document(
page_content=page.extract_text(),
metadata={
"source": f"https://drive.google.com/file/d/{id}/view",
"title": f"{file.get('name')}",
"page": i,
},
docs = []
for i, page in enumerate(pdf_reader.pages):
metadata = {
"source": f"https://drive.google.com/file/d/{id}/view",
"title": f"{file.get('name')}",
"page": i,
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
docs.append(
Document(
page_content=page.extract_text(),
metadata=metadata,
)
)
for i, page in enumerate(pdf_reader.pages)
]
return docs

def _load_file_from_ids(self) -> List[Document]:
"""Load files from a list of IDs."""
Expand Down

0 comments on commit 7668e1f

Please sign in to comment.