ASPX SharePoint page support. #9

Open · wants to merge 3 commits into base: main
54 changes: 54 additions & 0 deletions gbb_ai/sharepoint_data_extractor.py
@@ -729,3 +729,57 @@ def _format_metadata(
"read_access_entity": users_by_role,
}
return formatted_metadata

def get_all_site_pages(self, site_id: str) -> List[Dict[str, Any]]:
"""
Retrieves all the site pages from a given SharePoint site.

:param site_id: The site ID in Microsoft Graph.
:return: A list of dictionaries containing information about each page.
"""
url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/pages"
try:
pages = self._make_ms_graph_request(url)
return pages.get("value", [])
except Exception as err:
logger.error(f"Error retrieving site pages: {err}")
return []

def _get_page_content(self, site_id: str, page_id: str) -> Optional[Dict[str, Any]]:
"""
Retrieves the content of a specific site page using the page ID.

:param site_id: The site ID in Microsoft Graph.
:param page_id: The ID of the page to retrieve content from.
:return: A dictionary containing the page content, including canvas layout.
"""
url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/pages/{page_id}/microsoft.graph.sitePage?$expand=canvasLayout"
try:
page_content = self._make_ms_graph_request(url)
return page_content
except Exception as err:
logger.error(f"Error retrieving page content: {err}")
return None

def retrieve_and_process_site_pages(self, site_id: str) -> List[Dict[str, Any]]:
"""
Retrieves all site pages and processes each page's content.

:param site_id: The site ID in Microsoft Graph.
:return: A list of processed pages with their content.
"""
all_pages = self.get_all_site_pages(site_id)
processed_pages = []

for page in all_pages:
page_id = page.get("id")
if page_id:
page_content = self._get_page_content(site_id, page_id)
if page_content:
# Here you can process the page content, e.g., chunking, etc.
processed_pages.append({
"page_id": page_id,
"content": page_content
})

return processed_pages
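
Taken together, the three new methods form a small pipeline: list a site's pages, fetch each page with its canvasLayout expanded, and collect the results for downstream processing. The sketch below shows one way the pipeline might be driven; it is not part of the PR. The class name SharePointDataExtractor and the auth helpers are assumptions inferred from the module path and the repo's existing file-extraction flow, so substitute the actual names.

# Usage sketch only: class name and auth helpers are assumed, not confirmed by this diff.
from gbb_ai.sharepoint_data_extractor import SharePointDataExtractor

extractor = SharePointDataExtractor()
extractor.load_environment_variables_from_env_file()  # assumed helper; adjust to the repo's setup
extractor.msgraph_auth()                              # assumed helper; adjust to the repo's setup

site_id = "contoso.sharepoint.com,aaaa-bbbb,cccc-dddd"  # placeholder Graph site ID
pages = extractor.retrieve_and_process_site_pages(site_id)

for page in pages:
    # canvasLayout holds the sections/web parts that make up the ASPX page body.
    layout = page["content"].get("canvasLayout", {})
    sections = layout.get("horizontalSections", [])
    print(page["page_id"], f"{len(sections)} horizontal section(s)")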
6 changes: 3 additions & 3 deletions requirements.txt
@@ -2,12 +2,12 @@ requests>=2,<3
msal>=0.6.1,<2
python-docx
python-dotenv
#azure_search_documents==11.4.0b11
azure-search-documents==11.4.0b8
azure_search_documents==11.4.0b11
#azure-search-documents==11.4.0b8
azure-ai-formrecognizer
openai==0.27.10
langchain
tiktoken
PyPDF2
openai==1.5.0
tenacity
bs4==0.0.2
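
Note that the pin moves from openai==0.27.10 to openai==1.5.0, which crosses the 1.0 breaking change: module-level calls such as openai.ChatCompletion.create are replaced by a client object. A minimal sketch of the 1.x call style follows; the model name is a placeholder, and an Azure OpenAI deployment would use the AzureOpenAI client instead.

from openai import OpenAI  # 1.x client-style API; use AzureOpenAI for Azure endpoints

client = OpenAI()  # reads OPENAI_API_KEY from the environment
response = client.chat.completions.create(
    model="gpt-4",  # placeholder model/deployment name
    messages=[{"role": "user", "content": "Summarize this SharePoint page."}],
)
print(response.choices[0].message.content)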
89 changes: 62 additions & 27 deletions vectors-01-create-index.ipynb
@@ -18,32 +18,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import requests\n",
"from azure.core.credentials import AzureKeyCredential \n",
"from azure.search.documents import SearchClient \n",
"from azure.search.documents.indexes import SearchIndexClient \n",
"from azure.search.documents.models import (\n",
" RawVectorQuery,\n",
")\n",
"from azure.search.documents.indexes.models import ( \n",
" CorsOptions,\n",
" ExhaustiveKnnParameters, \n",
" ExhaustiveKnnVectorSearchAlgorithmConfiguration,\n",
" HnswParameters, \n",
" HnswVectorSearchAlgorithmConfiguration,\n",
" SimpleField,\n",
" SearchField, \n",
" ComplexField,\n",
" SearchFieldDataType, \n",
" SearchIndex, \n",
" VectorSearch, \n",
" VectorSearchAlgorithmKind, \n",
" VectorSearchProfile, \n",
" VectorSearch,\n",
" VectorSearchAlgorithmKind,\n",
" VectorSearchProfile,\n",
")\n",
" \n",
"\n",
@@ -65,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -87,9 +93,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index sharepoint-site deleted\n"
]
}
],
"source": [
"# Delete the index if it exists\n",
"try:\n",
@@ -101,9 +115,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index sharepoint-site created\n"
]
}
],
"source": [
"# Create the index\n",
"fields = [\n",
@@ -182,13 +204,33 @@
" collection=True,\n",
" fields=[SimpleField(name=\"list_item\", type=SearchFieldDataType.String, searchable=True, filterable=True,)],\n",
" searchable=True),\n",
"\n",
" # Security field as collection of strings, filterable, not retrievable\n",
" SimpleField(\n",
" name=\"security\",\n",
" type=SearchFieldDataType.Collection(SearchFieldDataType.String),\n",
" filterable=True,\n",
" retrievable=False, # Ensures the field is not returned in search results\n",
" ),\n",
" # Allowed users field\n",
" SimpleField(\n",
" name=\"allowedUsers\",\n",
" type=SearchFieldDataType.Collection(SearchFieldDataType.String),\n",
" filterable=True,\n",
" retrievable=False, # Ensures this field is not returned in search results\n",
" ),\n",
" # Allowed groups field\n",
" SimpleField(\n",
" name=\"allowedGroups\",\n",
" type=SearchFieldDataType.Collection(SearchFieldDataType.String),\n",
" filterable=True,\n",
" retrievable=False, # Ensures this field is not returned in search results\n",
" ),\n",
"]\n",
"\n",
"cors_options = CorsOptions(allowed_origins=[\"*\"], max_age_in_seconds=60)\n",
"scoring_profiles = []\n",
"suggester = [{\"name\": \"sg\", \"source_fields\": [\"name\"]}]\n",
"\n",
"\n",
"# Configure the vector search configuration \n",
"vector_search = VectorSearch( \n",
" algorithms=[ \n",
@@ -203,13 +245,13 @@
" ), \n",
" )\n",
" ], \n",
" profiles=[ \n",
" profiles=[ \n",
" VectorSearchProfile( \n",
" name=\"myHnswProfile\", \n",
" algorithm=\"myHnsw\", \n",
" ), \n",
" ], \n",
") \n",
")\n",
"\n",
"index = SearchIndex(\n",
" name=os.environ[\"SEARCH_INDEX_NAME\"],\n",
@@ -226,13 +268,6 @@
"except Exception as ex:\n",
" print(ex)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -251,7 +286,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
}
},
"nbformat": 4,
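
The three new index fields (security, allowedUsers, allowedGroups) only pay off at query time, when results are trimmed to the caller's identity, and the notebook stops at index creation. Below is a hedged sketch of a security-trimmed query against the index defined above. The field names and SEARCH_INDEX_NAME come from the notebook; the endpoint/key environment-variable names and the identity values are placeholders.

import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Placeholder env-var names for the service endpoint and key; adjust to the repo's .env.
search_client = SearchClient(
    endpoint=os.environ["SEARCH_SERVICE_ENDPOINT"],
    index_name=os.environ["SEARCH_INDEX_NAME"],
    credential=AzureKeyCredential(os.environ["SEARCH_ADMIN_KEY"]),
)

# Placeholder identity values: the caller's AAD object ID and group IDs.
user_id = "00000000-0000-0000-0000-000000000001"
group_ids = "11111111-1111-1111-1111-111111111111, 22222222-2222-2222-2222-222222222222"

# search.in matches each collection element against a comma-separated value list,
# so a document is returned only if the user or one of their groups is allowed.
results = search_client.search(
    search_text="quarterly report",
    filter=(
        f"allowedUsers/any(u: search.in(u, '{user_id}')) or "
        f"allowedGroups/any(g: search.in(g, '{group_ids}'))"
    ),
)
for doc in results:
    print(doc["name"])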