From 7c0707a8a9c99a4a81a8a06be89402738235bb83 Mon Sep 17 00:00:00 2001 From: Aubin <60398825+aubin-tchoi@users.noreply.github.com> Date: Thu, 12 Dec 2024 10:28:43 +0100 Subject: [PATCH] [KWSearch] Double Confluence parent IDs (#9274) * upsert both the old and new parents for confluence pages * add migration script * fix borrow errors * fix borrow errors * clone the qdrant_clients for concurrent use * pass store as a ref * clone instead of passing refs for the store * remove migration script for confluence * fix: make the upsert consistent with the updateDocumentParents --- .../src/connectors/confluence/lib/internal_ids.ts | 15 +++++++++++++++ .../connectors/confluence/temporal/activities.ts | 14 +++++++++++--- .../src/connectors/confluence/temporal/utils.ts | 4 ++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/connectors/src/connectors/confluence/lib/internal_ids.ts b/connectors/src/connectors/confluence/lib/internal_ids.ts index 9b68d9f78767..91e53f102f8a 100644 --- a/connectors/src/connectors/confluence/lib/internal_ids.ts +++ b/connectors/src/connectors/confluence/lib/internal_ids.ts @@ -1,3 +1,8 @@ +import { + makeConfluencePageId, + makeConfluenceSpaceId, +} from "@connectors/connectors/confluence/temporal/utils"; + enum ConfluenceInternalIdPrefix { Space = "cspace_", Page = "cpage_", @@ -27,3 +32,13 @@ export function isConfluenceInternalPageId( ): internalId is `${ConfluenceInternalIdPrefix.Page}${string}` { return internalId.startsWith(ConfluenceInternalIdPrefix.Page); } + +export function convertInternalIdToDocumentId(internalId: string): string { + if (isConfluenceInternalPageId(internalId)) { + return makeConfluencePageId(getIdFromConfluenceInternalId(internalId)); + } + if (isConfluenceInternalSpaceId(internalId)) { + return makeConfluenceSpaceId(getIdFromConfluenceInternalId(internalId)); + } + throw new Error(`Invalid internal ID: ${internalId}`); +} diff --git a/connectors/src/connectors/confluence/temporal/activities.ts b/connectors/src/connectors/confluence/temporal/activities.ts index 34370a190c0b..1ecf78b37026 100644 --- a/connectors/src/connectors/confluence/temporal/activities.ts +++ b/connectors/src/connectors/confluence/temporal/activities.ts @@ -18,7 +18,10 @@ import { getConfluencePageParentIds, getSpaceHierarchy, } from "@connectors/connectors/confluence/lib/hierarchy"; -import { makeConfluenceInternalPageId } from "@connectors/connectors/confluence/lib/internal_ids"; +import { + convertInternalIdToDocumentId, + makeConfluenceInternalPageId, +} from "@connectors/connectors/confluence/lib/internal_ids"; import { makeConfluenceDocumentUrl, makeConfluencePageId, @@ -384,7 +387,8 @@ export async function confluenceCheckAndUpsertPageActivity({ documentUrl, loggerArgs, // Parent Ids will be computed after all page imports within the space have been completed. - parents: [makeConfluenceInternalPageId(documentId)], + // TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan + parents: [documentId, makeConfluenceInternalPageId(pageId)], tags, timestampMs: lastPageVersionCreatedAt.getTime(), upsertContext: { @@ -572,7 +576,11 @@ export async function confluenceUpdatePagesParentIdsActivity( await updateDocumentParentsField({ dataSourceConfig: dataSourceConfigFromConnector(connector), documentId: makeConfluencePageId(page.pageId), - parents: parentIds, + // TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan + parents: [ + ...parentIds, + ...parentIds.map(convertInternalIdToDocumentId), + ], }); }, { concurrency: 10 } diff --git a/connectors/src/connectors/confluence/temporal/utils.ts b/connectors/src/connectors/confluence/temporal/utils.ts index 84e91e4ea196..c5bb78f8f02b 100644 --- a/connectors/src/connectors/confluence/temporal/utils.ts +++ b/connectors/src/connectors/confluence/temporal/utils.ts @@ -34,6 +34,10 @@ export function makeConfluencePageId(pageId: string) { return `confluence-page-${pageId}`; } +export function makeConfluenceSpaceId(spaceId: string) { + return `confluence-space-${spaceId}`; +} + export function makeConfluenceDocumentUrl({ baseUrl, suffix,