diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/.env.sample b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/.env.sample index 143cecc6b..ee6cb6f5d 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/.env.sample +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/.env.sample @@ -5,6 +5,7 @@ OPENAI_API_KEY= O365_CLIENT_ID= O365_CLIENT_SECRET= O365_TENANT_ID= +SHAREPOINT_SITE_URL= # Pebblo configuration PEBBLO_CLOUD_URL= diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/README.md b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/README.md index 7975ee29a..29025f1b9 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/README.md +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/README.md @@ -42,6 +42,7 @@ OPENAI_API_KEY= O365_CLIENT_ID= O365_CLIENT_SECRET= O365_TENANT_ID= +SHAREPOINT_SITE_URL= # Postgres configuration PG_CONNECTION_STRING = "postgresql://:@:/" diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/msgraph_api_auth.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/msgraph_api_auth.py index 2b8ec3e91..70e593e03 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/msgraph_api_auth.py +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/msgraph_api_auth.py @@ -127,6 +127,88 @@ def get_access_token(self): else: return response.json()["access_token"] + @staticmethod + def format_site_url(site_url: str): + """ + Formats the site URL to include the colon(:) in the URL as required by the Microsoft Graph API. + Example: + 1. Default site URL: + input: https://tenant.sharepoint.com/ + output: tenant.sharepoint.com + 2. Custom site URL: + input: https://tenant.sharepoint.com/sites/site-name + output: tenant.sharepoint.com:/sites/site-name + + :param site_url: The original SharePoint site URL.
+ :return: The formatted site URL with a colon after the tenant domain. + """ + + # Check if the site URL contains the "/sites/" substring and format the URL accordingly + if "/sites/" in site_url: + parts = site_url.split("/sites/") + if parts[0].endswith(":"): + # If the URL already contains a colon, use the URL as is + formatted_url = site_url + else: + # Add a colon after the tenant domain + formatted_url = f"{parts[0]}:/sites/{parts[1]}" + else: + formatted_url = site_url + + # Remove the https:// prefix from the site URL + formatted_url = formatted_url.replace("https://", "") + return formatted_url + + def get_site_id(self, site_url): + """ + This function retrieves the ID of a SharePoint site using the Microsoft Graph API. + + Parameters: + site_url (str): The URL of the SharePoint site. + + Returns: + str: The ID of the SharePoint site. + """ + # Format the site URL + site_url = self.format_site_url(site_url) + # Build URL to request site ID + full_url = f"https://graph.microsoft.com/v1.0/sites/{site_url}" + response = requests.get( + full_url, headers={"Authorization": f"Bearer {self.access_token}"} + ) + site_id = response.json().get("id") # Return the site ID + return site_id + + def get_drive_id(self, site_id): + """ + This function retrieves the IDs and names of all drives associated with a specified SharePoint site. + + Parameters: + site_id (str): The ID of the SharePoint site. + + Returns: + list: A list of dictionaries. Each dictionary represents a drive on the SharePoint site. + Each dictionary contains the following keys: + - 'id': The ID of the drive. + - 'name': The name of the drive. 
+ """ + + # Retrieve drive IDs and names associated with a site + try: + drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives" + response = requests.get(drives_url, headers=self.headers) + drives = response.json().get("value", []) + drive_info = [ + ({"id": drive["id"], "name": drive["name"]}) for drive in drives + ] + # print(f"Drive Info: {drive_info}") + return drive_info + except requests.exceptions.HTTPError as e: + print( + f"Error while retrieving document library ID from Microsoft Graph API, Error: {e}" + ) + return [] + if __name__ == "__main__": pass diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/pebblo_saferag.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/pebblo_saferag.py index bad28d215..593257a8a 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/pebblo_saferag.py +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/pebblo_saferag.py @@ -158,11 +158,52 @@ def ask( return self.retrieval_chain.invoke(chain_input.dict()) +def select_drive(drives: list) -> tuple: + """ + Select SharePoint drive from the available drives + """ + if not drives: + print("No drives found for the site. Exiting ...") + exit(1) + elif len(drives) == 1: + _drive_id = drives[0].get("id") + _drive_name = drives[0].get("name") + else: + # Select "Documents" as a default drive + def_drive_idx = next( + ( + idx + for idx, drive in enumerate(drives) + if drive.get("name") == "Documents" + ), + 0, + ) + # Select drive + # print("Select a drive ...") + print("Available drives on the site:") + for idx, drive in enumerate(drives): + print(f"\t{idx + 1}. 
{drive.get('name')}") + + # Prompt user for drive index + _drive_idx = input(f"Enter drive index (default={def_drive_idx + 1}): ") + _drive_idx = int(_drive_idx) - 1 if _drive_idx else def_drive_idx + # Validate drive index and select default drive if invalid + if _drive_idx < 0 or _drive_idx >= len(drives): + print("Error. Invalid drive index! Selecting the default drive ...") + _drive_idx = def_drive_idx + + # Get drive info + _drive_id = drives[_drive_idx].get("id") + _drive_name = drives[_drive_idx].get("name") + return _drive_id, _drive_name + + if __name__ == "__main__": input_collection_name = "identity-enabled-rag-sharepoint" _client_id = os.environ.get("O365_CLIENT_ID") _client_secret = os.environ.get("O365_CLIENT_SECRET") _tenant_id = os.environ.get("O365_TENANT_ID") + _site_url = os.environ.get("SHAREPOINT_SITE_URL") print("Please enter the app details to authenticate with Microsoft Graph API ...") app_client_id = input(f"App client id ({_client_id}): ") or _client_id @@ -171,12 +212,38 @@ def ask( ) tenant_id = input(f"Tenant id ({_tenant_id}): ") or _tenant_id - print("\nPlease enter drive id for loading data...") - drive_id = input("Drive id : ") + print("\nInitializing SharepointADHelper ...") + sharepoint_helper = SharepointADHelper( + client_id=app_client_id, + client_secret=app_client_secret, + tenant_id=tenant_id, + ) + print("SharepointADHelper initialized.") + site_url = ( + input(f"\nEnter Sharepoint Site URL (default={_site_url}): ") or _site_url + ) + if not site_url: + print("\nSite URL is required. 
Exiting ...") + exit(1) + + # Get SharePoint Site ID using URL + site_id = sharepoint_helper.get_site_id(site_url) + print(f"Derived Site Id: {site_id}\n") + + # Get drive info using site id + print("Fetching drive info ...") + drive_info = sharepoint_helper.get_drive_id(site_id) + drive_id, drive_name = select_drive(drive_info) + print(f"\nSharePoint Drive name: {drive_name}, Drive Id: {drive_id}") + + # Enter Folder path + folder_path = input("\nEnter folder path (default='/document'): ") or "/document" + + # Initialize PebbloSafeRAG app rag_app = PebbloSafeRAG( drive_id=drive_id, - folder_path="/document", + folder_path=folder_path, collection_name=input_collection_name, ) @@ -202,11 +269,9 @@ def ask( prompt = input("Please provide the prompt : ") - authorized_identities = SharepointADHelper( - client_id=app_client_id, - client_secret=app_client_secret, - tenant_id=tenant_id, - ).get_authorized_identities(end_user_email_address) + authorized_identities = sharepoint_helper.get_authorized_identities( + end_user_email_address + ) response = rag_app.ask( prompt, diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/.env.sample b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/.env.sample index d42f7f586..d5cb4f521 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/.env.sample +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/.env.sample @@ -5,6 +5,7 @@ OPENAI_API_KEY= O365_CLIENT_ID= O365_CLIENT_SECRET= O365_TENANT_ID= +SHAREPOINT_SITE_URL= # Pebblo configuration PEBBLO_CLOUD_URL= diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/README.md b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/README.md index 38a685278..9aa505672 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/README.md +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/README.md @@ -32,6 +32,7 @@ OPENAI_API_KEY= O365_CLIENT_ID= 
O365_CLIENT_SECRET= O365_TENANT_ID= +SHAREPOINT_SITE_URL= # Pebblo Cloud configuration (optional) PEBBLO_CLOUD_URL= diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/msgraph_api_auth.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/msgraph_api_auth.py index 2b8ec3e91..70e593e03 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/msgraph_api_auth.py +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/msgraph_api_auth.py @@ -127,6 +127,88 @@ def get_access_token(self): else: return response.json()["access_token"] + @staticmethod + def format_site_url(site_url: str): + """ + Formats the site URL to include the colon(:) in the URL as required by the Microsoft Graph API. + Example: + 1. Default site URL: + input: https://tenant.sharepoint.com/ + output: tenant.sharepoint.com + 2. Custom site URL: + input: https://tenant.sharepoint.com/sites/site-name + output: tenant.sharepoint.com:/sites/site-name + + :param site_url: The original SharePoint site URL. + :return: The formatted site URL with a colon after the tenant domain. + """ + + # Check if the site URL contains the "/sites/" substring and format the URL accordingly + if "/sites/" in site_url: + parts = site_url.split("/sites/") + if parts[0].endswith(":"): + # If the URL already contains a colon, use the URL as is + formatted_url = site_url + else: + # Add a colon after the tenant domain + formatted_url = f"{parts[0]}:/sites/{parts[1]}" + else: + formatted_url = site_url + + # Remove the https:// prefix from the site URL + formatted_url = formatted_url.replace("https://", "") + return formatted_url + + def get_site_id(self, site_url): + """ + This function retrieves the ID of a SharePoint site using the Microsoft Graph API. + + Parameters: + site_url (str): The URL of the SharePoint site. + + Returns: + str: The ID of the SharePoint site.
+ """ + # Format the site URL + site_url = self.format_site_url(site_url) + # Build URL to request site ID + full_url = f"https://graph.microsoft.com/v1.0/sites/{site_url}" + response = requests.get( + full_url, headers={"Authorization": f"Bearer {self.access_token}"} + ) + site_id = response.json().get("id") # Return the site ID + return site_id + + def get_drive_id(self, site_id): + """ + This function retrieves the IDs and names of all drives associated with a specified SharePoint site. + + Parameters: + site_id (str): The ID of the SharePoint site. + + Returns: + list: A list of dictionaries. Each dictionary represents a drive on the SharePoint site. + Each dictionary contains the following keys: + - 'id': The ID of the drive. + - 'name': The name of the drive. + """ + + # Retrieve drive IDs and names associated with a site + try: + drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives" + response = requests.get(drives_url, headers=self.headers) + drives = response.json().get("value", []) + drive_info = [ + ({"id": drive["id"], "name": drive["name"]}) for drive in drives + ] + # print(f"Drive Info: {drive_info}") + return drive_info + except requests.exceptions.HTTPError as e: + print( + f"Error while retrieving document library ID from Microsoft Graph API, Error: {e}" + ) + return [] + if __name__ == "__main__": pass diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/pebblo_saferag.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/pebblo_saferag.py index 0617dfb43..1dc32a4a8 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/pebblo_saferag.py +++ b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/pebblo_saferag.py @@ -128,12 +128,53 @@ def ask( return self.retrieval_chain.invoke(chain_input.dict()) +def select_drive(drives: list) -> tuple: + """ + Select SharePoint drive from the available drives + """ + if not drives: + print("No drives found for the site. 
Exiting ...") + exit(1) + elif len(drives) == 1: + _drive_id = drives[0].get("id") + _drive_name = drives[0].get("name") + else: + # Select "Documents" as a default drive + def_drive_idx = next( + ( + idx + for idx, drive in enumerate(drives) + if drive.get("name") == "Documents" + ), + 0, + ) + # Select drive + # print("Select a drive ...") + print("Available drives on the site:") + for idx, drive in enumerate(drives): + print(f"\t{idx + 1}. {drive.get('name')}") + + # Prompt user for drive index + _drive_idx = input(f"Enter drive index (default={def_drive_idx + 1}): ") + _drive_idx = int(_drive_idx) - 1 if _drive_idx else def_drive_idx + # Validate drive index and select default drive if invalid + if _drive_idx < 0 or _drive_idx >= len(drives): + print("Error. Invalid drive index! Selecting the default drive ...") + _drive_idx = def_drive_idx + + # Get drive info + _drive_id = drives[_drive_idx].get("id") + _drive_name = drives[_drive_idx].get("name") + return _drive_id, _drive_name + + if __name__ == "__main__": input_collection_name = "identity-enabled-rag" _client_id = os.environ.get("O365_CLIENT_ID") _client_secret = os.environ.get("O365_CLIENT_SECRET") _tenant_id = os.environ.get("O365_TENANT_ID") + _site_url = os.environ.get("SHAREPOINT_SITE_URL") print("Please enter the app details to authenticate with Microsoft Graph API ...") app_client_id = input(f"App client id ({_client_id}): ") or _client_id @@ -142,12 +183,38 @@ def ask( ) tenant_id = input(f"Tenant id ({_tenant_id}): ") or _tenant_id - print("\nPlease enter drive id for loading data...") - drive_id = input("Drive id : ") + print("\nInitializing SharepointADHelper ...") + sharepoint_helper = SharepointADHelper( + client_id=app_client_id, + client_secret=app_client_secret, + tenant_id=tenant_id, + ) + print("SharepointADHelper initialized.") + + site_url = ( + input(f"\nEnter Sharepoint Site URL (default={_site_url}): ") or _site_url + ) + if not site_url: + print("\nSite URL is required. 
Exiting ...") + exit(1) + + # Get SharePoint Site ID using URL + site_id = sharepoint_helper.get_site_id(site_url) + print(f"Derived Site Id: {site_id}\n") + + # Get drive info using site id + print("Fetching drive info ...") + drive_info = sharepoint_helper.get_drive_id(site_id) + drive_id, drive_name = select_drive(drive_info) + print(f"\nSharePoint Drive name: {drive_name}, Drive Id: {drive_id}") + + # Enter Folder path + folder_path = input("\nEnter folder path (default='/document'): ") or "/document" + # Initialize PebbloSafeRAG app rag_app = PebbloSafeRAG( drive_id=drive_id, - folder_path="/document", + folder_path=folder_path, collection_name=input_collection_name, )