Skip to content

Commit

Permalink
support confluence single page only indexing (#2008)
Browse files Browse the repository at this point in the history
* added index recursively checkbox

* mypy fixes

* added migration to not break existing connectors
  • Loading branch information
hagen-danswer authored Aug 1, 2024
1 parent a54ea9f commit e6a92aa
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 20 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Rename index_origin to index_recursively
Revision ID: 1d6ad76d1f37
Revises: e1392f05e840
Create Date: 2024-08-01 12:38:54.466081
"""
from alembic import op

# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID of the
# migration it revises; both match the "Revision ID" / "Revises" lines in the
# module docstring.
revision = "1d6ad76d1f37"
down_revision = "e1392f05e840"
# No branch labels or additional dependencies are declared for this migration.
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Replace the ``index_origin`` config key with ``index_recursively``.

    Every ``connector`` row whose ``connector_specific_config`` contains an
    ``index_origin`` key gets ``index_recursively`` set to JSON ``true`` and
    has the ``index_origin`` key removed. The previous ``index_origin`` value
    is not carried over.
    """
    # In PostgreSQL, `jsonb ? key` tests for a top-level key and `jsonb - key`
    # deletes one. The statement text is kept exactly as originally written.
    rename_to_recursive_sql = """
UPDATE connector
SET connector_specific_config = jsonb_set(
connector_specific_config,
'{index_recursively}',
'true'::jsonb
) - 'index_origin'
WHERE connector_specific_config ? 'index_origin'
"""
    op.execute(rename_to_recursive_sql)


def downgrade() -> None:
    """Restore the ``index_origin`` config key from ``index_recursively``.

    Every ``connector`` row whose ``connector_specific_config`` contains an
    ``index_recursively`` key gets that key's current value copied into
    ``index_origin``, after which ``index_recursively`` is removed.
    """
    # `->` reads the stored jsonb value, so whatever `index_recursively` held
    # is what ends up under `index_origin`. The statement text is kept exactly
    # as originally written.
    restore_origin_sql = """
UPDATE connector
SET connector_specific_config = jsonb_set(
connector_specific_config,
'{index_origin}',
connector_specific_config->'index_recursively'
) - 'index_recursively'
WHERE connector_specific_config ? 'index_recursively'
"""
    op.execute(restore_origin_sql)
31 changes: 18 additions & 13 deletions backend/danswer/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,16 +217,19 @@ def __init__(
self,
batch_size: int,
confluence_client: Confluence,
index_origin: bool,
index_recursively: bool,
origin_page_id: str,
) -> None:
self.batch_size = 1
# batch_size
self.confluence_client = confluence_client
self.index_origin = index_origin
self.index_recursively = index_recursively
self.origin_page_id = origin_page_id
self.pages = self.recurse_children_pages(0, self.origin_page_id)

def get_origin_page(self) -> list[dict[str, Any]]:
    """Return the origin page as a single-element list.

    The page is obtained from ``self._fetch_origin_page()``; wrapping it in a
    list gives the result the same list-of-pages shape that ``get_pages``
    is annotated to return.
    """
    origin_page = self._fetch_origin_page()
    return [origin_page]

def get_pages(self, ind: int, size: int) -> list[dict]:
if ind * size > len(self.pages):
return []
Expand Down Expand Up @@ -282,12 +285,11 @@ def recurse_children_pages(
current_level_pages = next_level_pages
next_level_pages = []

if self.index_origin:
try:
origin_page = self._fetch_origin_page()
pages.append(origin_page)
except Exception as e:
logger.warning(f"Appending origin page with id {page_id} failed: {e}")
try:
origin_page = self._fetch_origin_page()
pages.append(origin_page)
except Exception as e:
logger.warning(f"Appending origin page with id {page_id} failed: {e}")

return pages

Expand Down Expand Up @@ -340,7 +342,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
def __init__(
self,
wiki_page_url: str,
index_origin: bool = True,
index_recursively: bool = True,
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
# if a page has one of the labels specified in this list, we will just
Expand All @@ -352,7 +354,7 @@ def __init__(
self.continue_on_failure = continue_on_failure
self.labels_to_skip = set(labels_to_skip)
self.recursive_indexer: RecursiveIndexer | None = None
self.index_origin = index_origin
self.index_recursively = index_recursively
(
self.wiki_base,
self.space,
Expand All @@ -369,7 +371,7 @@ def __init__(

logger.info(
f"wiki_base: {self.wiki_base}, space: {self.space}, page_id: {self.page_id},"
+ f" space_level_scan: {self.space_level_scan}, origin: {self.index_origin}"
+ f" space_level_scan: {self.space_level_scan}, index_recursively: {self.index_recursively}"
)

def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
Expand Down Expand Up @@ -453,10 +455,13 @@ def _fetch_page(start_ind: int, batch_size: int) -> list[dict[str, Any]]:
origin_page_id=self.page_id,
batch_size=self.batch_size,
confluence_client=self.confluence_client,
index_origin=self.index_origin,
index_recursively=self.index_recursively,
)

return self.recursive_indexer.get_pages(start_ind, batch_size)
if self.index_recursively:
return self.recursive_indexer.get_pages(start_ind, batch_size)
else:
return self.recursive_indexer.get_origin_page()

pages: list[dict[str, Any]] = []

Expand Down
19 changes: 12 additions & 7 deletions web/src/lib/connectors/connectors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,13 @@ export const connectorConfigs: Record<ValidSources, ConnectionConfiguration> = {
},
confluence: {
description: "Configure Confluence connector",
subtext: `Specify any link to a Confluence page below and click "Index" to Index. Based on the provided link, we will index either the entire page and its subpages OR the entire space. For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page's children (and optionally, itself). Use the checkbox below to determine whether or not to index the parent page in addition to its children.
subtext: `Specify any link to a Confluence page below and click "Index" to Index. If the provided link is for an entire space, we will index the entire space. However, if you want to index a specific page, you can do so by entering the page's URL.
For example, entering https://danswer.atlassian.net/wiki/spaces/Engineering/overview and clicking the Index button will index the whole Engineering Confluence space, but entering https://danswer.atlassian.net/wiki/spaces/Engineering/pages/164331/example+page will index that page (and optionally the page's children).
We pull the latest pages and comments from each space listed below every 10 minutes`,
Selecting the "Index Recursively" checkbox will index the single page's children in addition to itself.
We pull the latest pages and comments from each space every 10 minutes`,
values: [
{
type: "text",
Expand All @@ -232,10 +236,11 @@ We pull the latest pages and comments from each space listed below every 10 minu
},
{
type: "checkbox",
query: "(For pages) Index the page itself",
label: "(For pages) Index the page itself",
name: "index_origin",
optional: true,
query: "Should index pages recursively?",
label:
"Index Recursively (if this is set and the Wiki Page URL leads to a page, we will index the page and all of its children instead of just the page)",
name: "index_recursively",
optional: false,
},
],
},
Expand Down Expand Up @@ -811,7 +816,7 @@ export interface BookstackConfig {}

export interface ConfluenceConfig {
wiki_page_url: string;
index_origin?: boolean;
index_recursively?: boolean;
}

export interface JiraConfig {
Expand Down

0 comments on commit e6a92aa

Please sign in to comment.