From 7668e1fb1471a2ab0a3f96c87fa0a6b481bc38f6 Mon Sep 17 00:00:00 2001
From: Rahul Triptahi
Date: Fri, 5 Apr 2024 16:53:14 +0530
Subject: [PATCH] [feat]: add authorization identities to GoogleDriveLoader. (#110)

* [feat]: add authorization identities to GoogleDriveLoader.

---------

Signed-off-by: Rahul Tripathi
Co-authored-by: Rahul Tripathi
---
 .../langchain_google_community/drive.py       | 115 ++++++++++++++++---
 1 file changed, 104 insertions(+), 11 deletions(-)

diff --git a/libs/community/langchain_google_community/drive.py b/libs/community/langchain_google_community/drive.py
index 2848608d..90d4fb43 100644
--- a/libs/community/langchain_google_community/drive.py
+++ b/libs/community/langchain_google_community/drive.py
@@ -45,6 +45,81 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
     """The file loader class to use."""
     file_loader_kwargs: Dict["str", Any] = {}
     """The file loader kwargs to use."""
+    load_auth: bool = False
+    """Whether to load authorization identities."""
+
+    def _get_identity_metadata_from_id(self, id: str) -> List[str]:
+        """Fetch the email addresses of the identities with access to a file.
+
+        Args:
+            id: Google Drive ID of the file whose permissions are read.
+
+        Returns:
+            The email addresses found in the file's permissions; permissions
+            without an email address are skipped. An empty list is returned
+            when the permissions cannot be retrieved.
+
+        Raises:
+            ImportError: If `google-api-python-client` is not installed.
+        """
+        import logging
+
+        try:
+            import googleapiclient.errors  # type: ignore[import]
+            from googleapiclient.discovery import build  # type: ignore[import]
+        except ImportError as exc:
+            raise ImportError(
+                "You must run "
+                "`pip install --upgrade "
+                "google-api-python-client` "
+                "to load authorization identities."
+            ) from exc
+
+        logger = logging.getLogger(__name__)
+        authorized_identities: List[str] = []
+        creds = self._load_credentials()
+        service = build("drive", "v3", credentials=creds)
+        page_token = None
+        try:
+            # One paginated list call returns every email address, so no
+            # per-permission lookup (and no unguarded request) is needed.
+            while True:
+                response = (
+                    service.permissions()
+                    .list(
+                        fileId=id,
+                        fields="nextPageToken, permissions(id, emailAddress)",
+                        pageToken=page_token,
+                        supportsAllDrives=True,
+                    )
+                    .execute()
+                )
+                authorized_identities.extend(
+                    perm["emailAddress"]
+                    for perm in response.get("permissions", [])
+                    if perm.get("emailAddress")
+                )
+                page_token = response.get("nextPageToken")
+                if not page_token:
+                    break
+        except googleapiclient.errors.HttpError as exc:
+            # Usually a permission problem, but log the real cause.
+            logger.warning(
+                "Unable to retrieve the permissions for the file with "
+                "fileId: %s (%s)",
+                id,
+                exc,
+            )
+            return []
+        except Exception:
+            logger.exception(
+                "Error occurred while fetching the permissions for the file "
+                "with fileId: %s",
+                id,
+            )
+            return []
+
+        return authorized_identities
 
     @root_validator
     def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@@ -113,7 +188,7 @@ def _load_credentials(self) -> Any:
             )
         except ImportError:
             raise ImportError(
-                "You must run "
+                "Install prerequisites by running: "
                 "`pip install --upgrade "
                 "google-api-python-client google-auth-httplib2 "
                 "google-auth-oauthlib` "
@@ -157,6 +232,8 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
         sheets_service = build("sheets", "v4", credentials=creds)
         spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
         sheets = spreadsheet.get("sheets", [])
+        if self.load_auth:
+            authorized_identities = self._get_identity_metadata_from_id(id)
 
         documents = []
         for sheet in sheets:
@@ -181,6 +258,8 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
                     "title": f"{spreadsheet['properties']['title']} - {sheet_name}",
                     "row": i,
                 }
+                if self.load_auth:
+                    metadata["authorized_identities"] = authorized_identities
                 content = []
                 for j, v in enumerate(row):
                     title = header[j].strip() if len(header) > j else ""
@@ -201,6 +280,8 @@ def _load_document_from_id(self, id: str) -> Document:
 
         creds = self._load_credentials()
         service = build("drive", "v3", credentials=creds)
+        if self.load_auth:
+            authorized_identities = self._get_identity_metadata_from_id(id)
 
         file = (
             service.files()
@@ -227,6 +308,8 @@ def _load_document_from_id(self, id: str) -> Document:
             "title": f"{file.get('name')}",
             "when": f"{file.get('modifiedTime')}",
         }
+        if self.load_auth:
+            metadata["authorized_identities"] = authorized_identities  # type: ignore
         return Document(page_content=text, metadata=metadata)
 
     def _load_documents_from_folder(
@@ -304,6 +387,9 @@ def _load_file_from_id(self, id: str) -> List[Document]:
         creds = self._load_credentials()
         service = build("drive", "v3", credentials=creds)
 
+        if self.load_auth:
+            authorized_identities = self._get_identity_metadata_from_id(id)
+
         file = service.files().get(fileId=id, supportsAllDrives=True).execute()
         request = service.files().get_media(fileId=id)
         fh = BytesIO()
@@ -320,6 +406,8 @@ def _load_file_from_id(self, id: str) -> List[Document]:
                 doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
                 if "title" not in doc.metadata:
                     doc.metadata["title"] = f"{file.get('name')}"
+                if self.load_auth:
+                    doc.metadata["authorized_identities"] = authorized_identities
             return docs
 
         else:
@@ -328,17 +416,22 @@ def _load_file_from_id(self, id: str) -> List[Document]:
             content = fh.getvalue()
             pdf_reader = PdfReader(BytesIO(content))
 
-            return [
-                Document(
-                    page_content=page.extract_text(),
-                    metadata={
-                        "source": f"https://drive.google.com/file/d/{id}/view",
-                        "title": f"{file.get('name')}",
-                        "page": i,
-                    },
+            docs = []
+            for i, page in enumerate(pdf_reader.pages):
+                metadata = {
+                    "source": f"https://drive.google.com/file/d/{id}/view",
+                    "title": f"{file.get('name')}",
+                    "page": i,
+                }
+                if self.load_auth:
+                    metadata["authorized_identities"] = authorized_identities
+                docs.append(
+                    Document(
+                        page_content=page.extract_text(),
+                        metadata=metadata,
+                    )
                 )
-                for i, page in enumerate(pdf_reader.pages)
-            ]
+            return docs
 
     def _load_file_from_ids(self) -> List[Document]:
         """Load files from a list of IDs."""