Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[connectors] update Confluence to use new internalIDs in content nodes and upsertions #9337

Merged
merged 2 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 17 additions & 18 deletions connectors/src/connectors/confluence/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ import {
getSpaceHierarchy,
} from "@connectors/connectors/confluence/lib/hierarchy";
import {
getIdFromConfluenceInternalId,
isConfluenceInternalPageId,
isConfluenceInternalSpaceId,
makeConfluenceInternalSpaceId,
getConfluenceIdFromInternalId,
isInternalPageId,
isInternalSpaceId,
makeSpaceInternalId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import {
checkPageHasChildren,
Expand All @@ -40,8 +40,10 @@ import type {
CreateConnectorErrorCode,
UpdateConnectorErrorCode,
} from "@connectors/connectors/interface";
import { ConnectorManagerError } from "@connectors/connectors/interface";
import { BaseConnectorManager } from "@connectors/connectors/interface";
import {
BaseConnectorManager,
ConnectorManagerError,
} from "@connectors/connectors/interface";
import { concurrentExecutor } from "@connectors/lib/async_utils";
import {
ConfluenceConfiguration,
Expand Down Expand Up @@ -338,7 +340,7 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {
const addedSpaceIds = [];
const removedSpaceIds = [];
for (const [internalId, permission] of Object.entries(permissions)) {
const confluenceId = getIdFromConfluenceInternalId(internalId);
const confluenceId = getConfluenceIdFromInternalId(internalId);
if (permission === "none") {
await ConfluenceSpace.destroy({
where: {
Expand Down Expand Up @@ -412,10 +414,10 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {
const pageIds: string[] = [];

internalIds.forEach((internalId) => {
if (isConfluenceInternalSpaceId(internalId)) {
spaceIds.push(getIdFromConfluenceInternalId(internalId));
} else if (isConfluenceInternalPageId(internalId)) {
pageIds.push(getIdFromConfluenceInternalId(internalId));
if (isInternalSpaceId(internalId)) {
spaceIds.push(getConfluenceIdFromInternalId(internalId));
} else if (isInternalPageId(internalId)) {
pageIds.push(getConfluenceIdFromInternalId(internalId));
}
});

Expand Down Expand Up @@ -479,12 +481,12 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {
internalId: string;
memoizationKey?: string;
}): Promise<Result<string[], Error>> {
if (isConfluenceInternalSpaceId(internalId)) {
if (isInternalSpaceId(internalId)) {
return new Ok([internalId]);
}

if (isConfluenceInternalPageId(internalId)) {
const confluenceId = getIdFromConfluenceInternalId(internalId);
if (isInternalPageId(internalId)) {
const confluenceId = getConfluenceIdFromInternalId(internalId);
const currentPage = await ConfluencePage.findOne({
attributes: ["pageId", "parentId", "spaceId"],
where: {
Expand All @@ -499,10 +501,7 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {

// If the page does not have a parentId, return only the spaceId.
if (!currentPage.parentId) {
return new Ok([
internalId,
makeConfluenceInternalSpaceId(currentPage.spaceId),
]);
return new Ok([internalId, makeSpaceInternalId(currentPage.spaceId)]);
}

// if a memoization key is provided, use it to cache the hierarchy which
Expand Down
10 changes: 5 additions & 5 deletions connectors/src/connectors/confluence/lib/hierarchy.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import type { ModelId } from "@dust-tt/types";

import {
makeConfluenceInternalPageId,
makeConfluenceInternalSpaceId,
makePageInternalId,
makeSpaceInternalId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import { ConfluencePage } from "@connectors/lib/models/confluence";

Expand Down Expand Up @@ -68,9 +68,9 @@ export async function getConfluencePageParentIds(

return [
// Add the current page.
makeConfluenceInternalPageId(page.pageId),
...parentIds.map((p) => makeConfluenceInternalPageId(p)),
makePageInternalId(page.pageId),
...parentIds.map((p) => makePageInternalId(p)),
// Add the space id at the end.
makeConfluenceInternalSpaceId(page.spaceId),
makeSpaceInternalId(page.spaceId),
];
}
41 changes: 18 additions & 23 deletions connectors/src/connectors/confluence/lib/internal_ids.ts
Original file line number Diff line number Diff line change
@@ -1,44 +1,39 @@
import {
makeConfluencePageId,
makeConfluenceSpaceId,
} from "@connectors/connectors/confluence/temporal/utils";
/**
Naming conventions:
- Confluence ID: The ID that Confluence uses to identify a space or page, usually just a number.
- Internal ID: The ID we use in content nodes and in data_source_documents (both document_id and parents).
Internal IDs are Confluence IDs with a prefix `confluence-space-` or `confluence-page-`.
*/

enum ConfluenceInternalIdPrefix {
Space = "cspace_",
Page = "cpage_",
Space = "confluence-space-",
Page = "confluence-page",
}

export function makeConfluenceInternalSpaceId(confluenceSpaceId: string) {
export function makeSpaceInternalId(confluenceSpaceId: string) {
return `${ConfluenceInternalIdPrefix.Space}${confluenceSpaceId}`;
}

export function makeConfluenceInternalPageId(confluencePageId: string) {
export function makePageInternalId(confluencePageId: string) {
return `${ConfluenceInternalIdPrefix.Page}${confluencePageId}`;
}

export function getIdFromConfluenceInternalId(internalId: string) {
const prefixPattern = `^(${ConfluenceInternalIdPrefix.Space}|${ConfluenceInternalIdPrefix.Page})`;
return internalId.replace(new RegExp(prefixPattern), "");
export function getConfluenceIdFromInternalId(internalId: string) {
if (isInternalPageId(internalId) || isInternalSpaceId(internalId)) {
const prefixPattern = `^(${ConfluenceInternalIdPrefix.Space}|${ConfluenceInternalIdPrefix.Page})`;
return internalId.replace(new RegExp(prefixPattern), "");
}
throw new Error(`Invalid internal ID: ${internalId}`);
}

export function isConfluenceInternalSpaceId(
export function isInternalSpaceId(
internalId: string
): internalId is `${ConfluenceInternalIdPrefix.Space}${string}` {
return internalId.startsWith(ConfluenceInternalIdPrefix.Space);
}

export function isConfluenceInternalPageId(
export function isInternalPageId(
internalId: string
): internalId is `${ConfluenceInternalIdPrefix.Page}${string}` {
return internalId.startsWith(ConfluenceInternalIdPrefix.Page);
}

export function convertInternalIdToDocumentId(internalId: string): string {
if (isConfluenceInternalPageId(internalId)) {
return makeConfluencePageId(getIdFromConfluenceInternalId(internalId));
}
if (isConfluenceInternalSpaceId(internalId)) {
return makeConfluenceSpaceId(getIdFromConfluenceInternalId(internalId));
}
throw new Error(`Invalid internal ID: ${internalId}`);
}
26 changes: 13 additions & 13 deletions connectors/src/connectors/confluence/lib/permissions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ import { Op } from "sequelize";
import { listConfluenceSpaces } from "@connectors/connectors/confluence/lib/confluence_api";
import type { ConfluenceSpaceType } from "@connectors/connectors/confluence/lib/confluence_client";
import {
getIdFromConfluenceInternalId,
isConfluenceInternalPageId,
isConfluenceInternalSpaceId,
makeConfluenceInternalPageId,
makeConfluenceInternalSpaceId,
getConfluenceIdFromInternalId,
isInternalPageId,
isInternalSpaceId,
makePageInternalId,
makeSpaceInternalId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import type { ConfluenceConfiguration } from "@connectors/lib/models/confluence";
import {
Expand Down Expand Up @@ -47,7 +47,7 @@ export function createContentNodeFromSpace(

return {
provider: "confluence",
internalId: makeConfluenceInternalSpaceId(spaceId),
internalId: makeSpaceInternalId(spaceId),
parentInternalId: null,
type: "folder",
title: space.name || "Unnamed Space",
Expand All @@ -67,11 +67,11 @@ export function createContentNodeFromPage(
): ContentNode {
return {
provider: "confluence",
internalId: makeConfluenceInternalPageId(page.pageId),
internalId: makePageInternalId(page.pageId),
parentInternalId:
parent.type === "space"
? makeConfluenceInternalSpaceId(parent.id)
: makeConfluenceInternalPageId(parent.id),
? makeSpaceInternalId(parent.id)
: makePageInternalId(parent.id),
type: "file",
title: page.title,
sourceUrl: `${baseUrl}/wiki${page.externalUrl}`,
Expand Down Expand Up @@ -102,7 +102,7 @@ async function getSynchronizedSpaces(
confluenceConfig: ConfluenceConfiguration,
parentInternalId: string
): Promise<Result<ContentNode[], Error>> {
const confluenceId = getIdFromConfluenceInternalId(parentInternalId);
const confluenceId = getConfluenceIdFromInternalId(parentInternalId);

const parentSpace = await ConfluenceSpace.findOne({
attributes: ["id", "spaceId"],
Expand Down Expand Up @@ -149,7 +149,7 @@ async function getSynchronizedChildrenPages(
confluenceConfig: ConfluenceConfiguration,
parentInternalId: string
): Promise<Result<ContentNode[], Error>> {
const confluenceId = getIdFromConfluenceInternalId(parentInternalId);
const confluenceId = getConfluenceIdFromInternalId(parentInternalId);

const parentPage = await ConfluencePage.findOne({
attributes: ["id", "pageId"],
Expand Down Expand Up @@ -196,7 +196,7 @@ export async function retrieveHierarchyForParent(
const { id: connectorId } = connector;

if (parentInternalId) {
if (isConfluenceInternalSpaceId(parentInternalId)) {
if (isInternalSpaceId(parentInternalId)) {
const resources = await getSynchronizedSpaces(
connectorId,
confluenceConfig,
Expand All @@ -208,7 +208,7 @@ export async function retrieveHierarchyForParent(
}

return new Ok(resources.value);
} else if (isConfluenceInternalPageId(parentInternalId)) {
} else if (isInternalPageId(parentInternalId)) {
const resources = await getSynchronizedChildrenPages(
connectorId,
confluenceConfig,
Expand Down
25 changes: 7 additions & 18 deletions connectors/src/connectors/confluence/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,8 @@ import {
getConfluencePageParentIds,
getSpaceHierarchy,
} from "@connectors/connectors/confluence/lib/hierarchy";
import {
convertInternalIdToDocumentId,
makeConfluenceInternalPageId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import {
makeConfluenceDocumentUrl,
makeConfluencePageId,
} from "@connectors/connectors/confluence/temporal/utils";
import { makePageInternalId } from "@connectors/connectors/confluence/lib/internal_ids";
import { makeConfluenceDocumentUrl } from "@connectors/connectors/confluence/temporal/workflow_ids";
import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
import { concurrentExecutor } from "@connectors/lib/async_utils";
import {
Expand Down Expand Up @@ -352,7 +346,7 @@ export async function confluenceCheckAndUpsertPageActivity({
content: renderedMarkdown,
});

const documentId = makeConfluencePageId(pageId);
const documentId = makePageInternalId(pageId);
const documentUrl = makeConfluenceDocumentUrl({
baseUrl: confluenceConfig.url,
suffix: page._links.tinyui,
Expand Down Expand Up @@ -387,8 +381,7 @@ export async function confluenceCheckAndUpsertPageActivity({
documentUrl,
loggerArgs,
// Parent Ids will be computed after all page imports within the space have been completed.
// TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan
parents: [documentId, makeConfluenceInternalPageId(pageId)],
parents: [documentId],
tags,
timestampMs: lastPageVersionCreatedAt.getTime(),
upsertContext: {
Expand Down Expand Up @@ -575,12 +568,8 @@ export async function confluenceUpdatePagesParentIdsActivity(

await updateDocumentParentsField({
dataSourceConfig: dataSourceConfigFromConnector(connector),
documentId: makeConfluencePageId(page.pageId),
// TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan
parents: [
...parentIds,
...parentIds.map(convertInternalIdToDocumentId),
],
documentId: makePageInternalId(page.pageId),
parents: parentIds,
});
},
{ concurrency: 10 }
Expand Down Expand Up @@ -631,7 +620,7 @@ async function deletePage(

const localLogger = logger.child(loggerArgs);

const documentId = makeConfluencePageId(pageId);
const documentId = makePageInternalId(pageId);
localLogger.info(
{ documentId },
"Deleting Confluence page from Dust data source."
Expand Down
2 changes: 1 addition & 1 deletion connectors/src/connectors/confluence/temporal/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import { spaceUpdatesSignal } from "@connectors/connectors/confluence/temporal/s
import {
makeConfluencePersonalDataWorkflowId,
makeConfluenceRemoveSpacesWorkflowId,
} from "@connectors/connectors/confluence/temporal/utils";
} from "@connectors/connectors/confluence/temporal/workflow_ids";
import {
confluencePersonalDataReportingWorkflow,
confluenceRemoveSpacesWorkflow,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,6 @@ export function makeConfluencePersonalDataWorkflowId() {
return `confluence-personal-data-reporting`;
}

export function makeConfluencePageId(pageId: string) {
// Omit space reference in the ID to accommodate Confluence pages moving between spaces.
return `confluence-page-${pageId}`;
}

export function makeConfluenceSpaceId(spaceId: string) {
return `confluence-space-${spaceId}`;
}

export function makeConfluenceDocumentUrl({
baseUrl,
suffix,
Expand Down
2 changes: 1 addition & 1 deletion connectors/src/connectors/confluence/temporal/workflows.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import {
makeConfluenceRemoveSpaceWorkflowIdFromParentId,
makeConfluenceSpaceSyncWorkflowIdFromParentId,
makeConfluenceSyncTopLevelChildPagesWorkflowIdFromParentId,
} from "@connectors/connectors/confluence/temporal/utils";
} from "@connectors/connectors/confluence/temporal/workflow_ids";

const {
confluenceGetSpaceNameActivity,
Expand Down
Loading