Skip to content

Commit

Permalink
refactor: use new IDs everywhere (content nodes + upsertions)
Browse files Browse the repository at this point in the history
  • Loading branch information
aubin-tchoi committed Dec 12, 2024
1 parent a9deb8d commit 2e7fcc2
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 87 deletions.
35 changes: 17 additions & 18 deletions connectors/src/connectors/confluence/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ import {
getSpaceHierarchy,
} from "@connectors/connectors/confluence/lib/hierarchy";
import {
getIdFromConfluenceInternalId,
isConfluenceInternalPageId,
isConfluenceInternalSpaceId,
makeConfluenceInternalSpaceId,
getConfluenceIdFromInternalId,
isInternalPageId,
isInternalSpaceId,
makeSpaceInternalId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import {
checkPageHasChildren,
Expand All @@ -40,8 +40,10 @@ import type {
CreateConnectorErrorCode,
UpdateConnectorErrorCode,
} from "@connectors/connectors/interface";
import { ConnectorManagerError } from "@connectors/connectors/interface";
import { BaseConnectorManager } from "@connectors/connectors/interface";
import {
BaseConnectorManager,
ConnectorManagerError,
} from "@connectors/connectors/interface";
import { concurrentExecutor } from "@connectors/lib/async_utils";
import {
ConfluenceConfiguration,
Expand Down Expand Up @@ -338,7 +340,7 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {
const addedSpaceIds = [];
const removedSpaceIds = [];
for (const [internalId, permission] of Object.entries(permissions)) {
const confluenceId = getIdFromConfluenceInternalId(internalId);
const confluenceId = getConfluenceIdFromInternalId(internalId);
if (permission === "none") {
await ConfluenceSpace.destroy({
where: {
Expand Down Expand Up @@ -412,10 +414,10 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {
const pageIds: string[] = [];

internalIds.forEach((internalId) => {
if (isConfluenceInternalSpaceId(internalId)) {
spaceIds.push(getIdFromConfluenceInternalId(internalId));
} else if (isConfluenceInternalPageId(internalId)) {
pageIds.push(getIdFromConfluenceInternalId(internalId));
if (isInternalSpaceId(internalId)) {
spaceIds.push(getConfluenceIdFromInternalId(internalId));
} else if (isInternalPageId(internalId)) {
pageIds.push(getConfluenceIdFromInternalId(internalId));
}
});

Expand Down Expand Up @@ -479,12 +481,12 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {
internalId: string;
memoizationKey?: string;
}): Promise<Result<string[], Error>> {
if (isConfluenceInternalSpaceId(internalId)) {
if (isInternalSpaceId(internalId)) {
return new Ok([internalId]);
}

if (isConfluenceInternalPageId(internalId)) {
const confluenceId = getIdFromConfluenceInternalId(internalId);
if (isInternalPageId(internalId)) {
const confluenceId = getConfluenceIdFromInternalId(internalId);
const currentPage = await ConfluencePage.findOne({
attributes: ["pageId", "parentId", "spaceId"],
where: {
Expand All @@ -499,10 +501,7 @@ export class ConfluenceConnectorManager extends BaseConnectorManager<null> {

// If the page does not have a parentId, return only the spaceId.
if (!currentPage.parentId) {
return new Ok([
internalId,
makeConfluenceInternalSpaceId(currentPage.spaceId),
]);
return new Ok([internalId, makeSpaceInternalId(currentPage.spaceId)]);
}

// if a memoization key is provided, use it to cache the hierarchy which
Expand Down
10 changes: 5 additions & 5 deletions connectors/src/connectors/confluence/lib/hierarchy.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import type { ModelId } from "@dust-tt/types";

import {
makeConfluenceInternalPageId,
makeConfluenceInternalSpaceId,
makePageInternalId,
makeSpaceInternalId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import { ConfluencePage } from "@connectors/lib/models/confluence";

Expand Down Expand Up @@ -68,9 +68,9 @@ export async function getConfluencePageParentIds(

return [
// Add the current page.
makeConfluenceInternalPageId(page.pageId),
...parentIds.map((p) => makeConfluenceInternalPageId(p)),
makePageInternalId(page.pageId),
...parentIds.map((p) => makePageInternalId(p)),
// Add the space id at the end.
makeConfluenceInternalSpaceId(page.spaceId),
makeSpaceInternalId(page.spaceId),
];
}
41 changes: 18 additions & 23 deletions connectors/src/connectors/confluence/lib/internal_ids.ts
Original file line number Diff line number Diff line change
@@ -1,44 +1,39 @@
import {
makeConfluencePageId,
makeConfluenceSpaceId,
} from "@connectors/connectors/confluence/temporal/utils";
/**
Naming conventions:
- Confluence ID: The ID that Confluence uses to identify a space or page, usually just a number.
- Internal ID: The ID we use in content nodes and in data_source_documents (both document_id and parents).
Internal IDs are Confluence IDs with a prefix `confluence-space-` or `confluence-page-`.
*/

enum ConfluenceInternalIdPrefix {
Space = "cspace_",
Page = "cpage_",
Space = "confluence-space-",
Page = "confluence-page",
}

export function makeConfluenceInternalSpaceId(confluenceSpaceId: string) {
export function makeSpaceInternalId(confluenceSpaceId: string) {
return `${ConfluenceInternalIdPrefix.Space}${confluenceSpaceId}`;
}

export function makeConfluenceInternalPageId(confluencePageId: string) {
export function makePageInternalId(confluencePageId: string) {
return `${ConfluenceInternalIdPrefix.Page}${confluencePageId}`;
}

export function getIdFromConfluenceInternalId(internalId: string) {
const prefixPattern = `^(${ConfluenceInternalIdPrefix.Space}|${ConfluenceInternalIdPrefix.Page})`;
return internalId.replace(new RegExp(prefixPattern), "");
export function getConfluenceIdFromInternalId(internalId: string) {
if (isInternalPageId(internalId) || isInternalSpaceId(internalId)) {
const prefixPattern = `^(${ConfluenceInternalIdPrefix.Space}|${ConfluenceInternalIdPrefix.Page})`;
return internalId.replace(new RegExp(prefixPattern), "");
}
throw new Error(`Invalid internal ID: ${internalId}`);
}

export function isConfluenceInternalSpaceId(
export function isInternalSpaceId(
internalId: string
): internalId is `${ConfluenceInternalIdPrefix.Space}${string}` {
return internalId.startsWith(ConfluenceInternalIdPrefix.Space);
}

export function isConfluenceInternalPageId(
export function isInternalPageId(
internalId: string
): internalId is `${ConfluenceInternalIdPrefix.Page}${string}` {
return internalId.startsWith(ConfluenceInternalIdPrefix.Page);
}

export function convertInternalIdToDocumentId(internalId: string): string {
if (isConfluenceInternalPageId(internalId)) {
return makeConfluencePageId(getIdFromConfluenceInternalId(internalId));
}
if (isConfluenceInternalSpaceId(internalId)) {
return makeConfluenceSpaceId(getIdFromConfluenceInternalId(internalId));
}
throw new Error(`Invalid internal ID: ${internalId}`);
}
26 changes: 13 additions & 13 deletions connectors/src/connectors/confluence/lib/permissions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ import { Op } from "sequelize";
import { listConfluenceSpaces } from "@connectors/connectors/confluence/lib/confluence_api";
import type { ConfluenceSpaceType } from "@connectors/connectors/confluence/lib/confluence_client";
import {
getIdFromConfluenceInternalId,
isConfluenceInternalPageId,
isConfluenceInternalSpaceId,
makeConfluenceInternalPageId,
makeConfluenceInternalSpaceId,
getConfluenceIdFromInternalId,
isInternalPageId,
isInternalSpaceId,
makePageInternalId,
makeSpaceInternalId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import type { ConfluenceConfiguration } from "@connectors/lib/models/confluence";
import {
Expand Down Expand Up @@ -47,7 +47,7 @@ export function createContentNodeFromSpace(

return {
provider: "confluence",
internalId: makeConfluenceInternalSpaceId(spaceId),
internalId: makeSpaceInternalId(spaceId),
parentInternalId: null,
type: "folder",
title: space.name || "Unnamed Space",
Expand All @@ -67,11 +67,11 @@ export function createContentNodeFromPage(
): ContentNode {
return {
provider: "confluence",
internalId: makeConfluenceInternalPageId(page.pageId),
internalId: makePageInternalId(page.pageId),
parentInternalId:
parent.type === "space"
? makeConfluenceInternalSpaceId(parent.id)
: makeConfluenceInternalPageId(parent.id),
? makeSpaceInternalId(parent.id)
: makePageInternalId(parent.id),
type: "file",
title: page.title,
sourceUrl: `${baseUrl}/wiki${page.externalUrl}`,
Expand Down Expand Up @@ -102,7 +102,7 @@ async function getSynchronizedSpaces(
confluenceConfig: ConfluenceConfiguration,
parentInternalId: string
): Promise<Result<ContentNode[], Error>> {
const confluenceId = getIdFromConfluenceInternalId(parentInternalId);
const confluenceId = getConfluenceIdFromInternalId(parentInternalId);

const parentSpace = await ConfluenceSpace.findOne({
attributes: ["id", "spaceId"],
Expand Down Expand Up @@ -149,7 +149,7 @@ async function getSynchronizedChildrenPages(
confluenceConfig: ConfluenceConfiguration,
parentInternalId: string
): Promise<Result<ContentNode[], Error>> {
const confluenceId = getIdFromConfluenceInternalId(parentInternalId);
const confluenceId = getConfluenceIdFromInternalId(parentInternalId);

const parentPage = await ConfluencePage.findOne({
attributes: ["id", "pageId"],
Expand Down Expand Up @@ -196,7 +196,7 @@ export async function retrieveHierarchyForParent(
const { id: connectorId } = connector;

if (parentInternalId) {
if (isConfluenceInternalSpaceId(parentInternalId)) {
if (isInternalSpaceId(parentInternalId)) {
const resources = await getSynchronizedSpaces(
connectorId,
confluenceConfig,
Expand All @@ -208,7 +208,7 @@ export async function retrieveHierarchyForParent(
}

return new Ok(resources.value);
} else if (isConfluenceInternalPageId(parentInternalId)) {
} else if (isInternalPageId(parentInternalId)) {
const resources = await getSynchronizedChildrenPages(
connectorId,
confluenceConfig,
Expand Down
25 changes: 7 additions & 18 deletions connectors/src/connectors/confluence/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,8 @@ import {
getConfluencePageParentIds,
getSpaceHierarchy,
} from "@connectors/connectors/confluence/lib/hierarchy";
import {
convertInternalIdToDocumentId,
makeConfluenceInternalPageId,
} from "@connectors/connectors/confluence/lib/internal_ids";
import {
makeConfluenceDocumentUrl,
makeConfluencePageId,
} from "@connectors/connectors/confluence/temporal/utils";
import { makePageInternalId } from "@connectors/connectors/confluence/lib/internal_ids";
import { makeConfluenceDocumentUrl } from "@connectors/connectors/confluence/temporal/workflow_ids";
import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
import { concurrentExecutor } from "@connectors/lib/async_utils";
import {
Expand Down Expand Up @@ -352,7 +346,7 @@ export async function confluenceCheckAndUpsertPageActivity({
content: renderedMarkdown,
});

const documentId = makeConfluencePageId(pageId);
const documentId = makePageInternalId(pageId);
const documentUrl = makeConfluenceDocumentUrl({
baseUrl: confluenceConfig.url,
suffix: page._links.tinyui,
Expand Down Expand Up @@ -387,8 +381,7 @@ export async function confluenceCheckAndUpsertPageActivity({
documentUrl,
loggerArgs,
// Parent Ids will be computed after all page imports within the space have been completed.
// TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan
parents: [documentId, makeConfluenceInternalPageId(pageId)],
parents: [documentId],
tags,
timestampMs: lastPageVersionCreatedAt.getTime(),
upsertContext: {
Expand Down Expand Up @@ -575,12 +568,8 @@ export async function confluenceUpdatePagesParentIdsActivity(

await updateDocumentParentsField({
dataSourceConfig: dataSourceConfigFromConnector(connector),
documentId: makeConfluencePageId(page.pageId),
// TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan
parents: [
...parentIds,
...parentIds.map(convertInternalIdToDocumentId),
],
documentId: makePageInternalId(page.pageId),
parents: parentIds,
});
},
{ concurrency: 10 }
Expand Down Expand Up @@ -631,7 +620,7 @@ async function deletePage(

const localLogger = logger.child(loggerArgs);

const documentId = makeConfluencePageId(pageId);
const documentId = makePageInternalId(pageId);
localLogger.info(
{ documentId },
"Deleting Confluence page from Dust data source."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,6 @@ export function makeConfluencePersonalDataWorkflowId() {
return `confluence-personal-data-reporting`;
}

export function makeConfluencePageId(pageId: string) {
// Omit space reference in the ID to accommodate Confluence pages moving between spaces.
return `confluence-page-${pageId}`;
}

export function makeConfluenceSpaceId(spaceId: string) {
return `confluence-space-${spaceId}`;
}

export function makeConfluenceDocumentUrl({
baseUrl,
suffix,
Expand Down
2 changes: 1 addition & 1 deletion connectors/src/connectors/confluence/temporal/workflows.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import {
makeConfluenceRemoveSpaceWorkflowIdFromParentId,
makeConfluenceSpaceSyncWorkflowIdFromParentId,
makeConfluenceSyncTopLevelChildPagesWorkflowIdFromParentId,
} from "@connectors/connectors/confluence/temporal/utils";
} from "@connectors/connectors/confluence/temporal/workflow_ids";

const {
confluenceGetSpaceNameActivity,
Expand Down

0 comments on commit 2e7fcc2

Please sign in to comment.