diff --git a/connectors/src/connectors/confluence/temporal/activities.ts b/connectors/src/connectors/confluence/temporal/activities.ts index 1ecf78b37026..c2fba762ac66 100644 --- a/connectors/src/connectors/confluence/temporal/activities.ts +++ b/connectors/src/connectors/confluence/temporal/activities.ts @@ -28,6 +28,7 @@ import { } from "@connectors/connectors/confluence/temporal/utils"; import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config"; import { concurrentExecutor } from "@connectors/lib/async_utils"; +import type { UpsertToDataSourceParams } from "@connectors/lib/data_sources"; import { deleteFromDataSource, renderDocumentTitleAndContent, @@ -220,7 +221,83 @@ export async function markPageHasVisited({ ); } -async function upsertConfluencePageInDb( +export async function upsertConfluencePageToDataSource( + page: NonNullable>>, + spaceName: string, + confluenceConfig: ConfluenceConfiguration, + syncType: UpsertToDataSourceParams["upsertContext"]["sync_type"], + dataSourceConfig: DataSourceConfig, + loggerArgs: Record +) { + const localLogger = logger.child(loggerArgs); + + const markdown = turndownService.turndown(page.body.storage.value); + const pageCreatedAt = new Date(page.createdAt); + const lastPageVersionCreatedAt = new Date(page.version.createdAt); + + if (markdown) { + const renderedMarkdown = await renderMarkdownSection( + dataSourceConfig, + markdown + ); + const renderedPage = await renderDocumentTitleAndContent({ + dataSourceConfig, + title: `Page ${page.title} Space ${spaceName}`, + createdAt: pageCreatedAt, + updatedAt: lastPageVersionCreatedAt, + content: renderedMarkdown, + }); + + const documentId = makeConfluencePageId(page.id); + const documentUrl = makeConfluenceDocumentUrl({ + baseUrl: confluenceConfig.url, + suffix: page._links.tinyui, + }); + + // We log the number of labels to help define the importance of labels in the future. + if (page.labels.results.length > 0) { + localLogger.info( + { labelsCount: page.labels.results.length }, + "Confluence page has labels." + ); + } + + // Limit to 10 custom tags. + const customTags = page.labels.results + .slice(0, 10) + .map((l) => `labels:${l.id}`); + + const tags = [ + `createdAt:${pageCreatedAt.getTime()}`, + `space:${spaceName}`, + `title:${page.title}`, + `updatedAt:${lastPageVersionCreatedAt.getTime()}`, + `version:${page.version.number}`, + ...customTags, + ]; + + await upsertToDatasource({ + dataSourceConfig, + documentContent: renderedPage, + documentId, + documentUrl, + loggerArgs, + // Parent Ids will be computed after all page imports within the space have been completed. + // TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan + parents: [documentId, makeConfluenceInternalPageId(page.id)], + tags, + timestampMs: lastPageVersionCreatedAt.getTime(), + upsertContext: { + sync_type: syncType, + }, + title: page.title, + mimeType: "application/vnd.dust.confluence.page", + async: true, + }); + } +} + +export async function upsertConfluencePageInDb( connectorId: ModelId, page: ConfluencePageWithBodyType, visitedAtMs: number @@ -335,70 +412,14 @@ export async function confluenceCheckAndUpsertPageActivity({ localLogger.info("Upserting Confluence page."); - const markdown = turndownService.turndown(page.body.storage.value); - const pageCreatedAt = new Date(page.createdAt); - const lastPageVersionCreatedAt = new Date(page.version.createdAt); - - if (markdown) { - const renderedMarkdown = await renderMarkdownSection( - dataSourceConfig, - markdown - ); - const renderedPage = await renderDocumentTitleAndContent({ - dataSourceConfig, - title: `Page ${page.title} Space ${spaceName}`, - createdAt: pageCreatedAt, - updatedAt: lastPageVersionCreatedAt, - content: renderedMarkdown, - }); - - const documentId = makeConfluencePageId(pageId); - const documentUrl = makeConfluenceDocumentUrl({ - baseUrl: confluenceConfig.url, - suffix: page._links.tinyui, - }); - - // We log the number of labels to help define the importance of labels in the future. - if (page.labels.results.length > 0) { - localLogger.info( - { labelsCount: page.labels.results.length }, - "Confluence page has labels." - ); - } - - // Limit to 10 custom tags. - const customTags = page.labels.results - .slice(0, 10) - .map((l) => `labels:${l.id}`); - - const tags = [ - `createdAt:${pageCreatedAt.getTime()}`, - `space:${spaceName}`, - `title:${page.title}`, - `updatedAt:${lastPageVersionCreatedAt.getTime()}`, - `version:${page.version.number}`, - ...customTags, - ]; - - await upsertToDatasource({ - dataSourceConfig, - documentContent: renderedPage, - documentId, - documentUrl, - loggerArgs, - // Parent Ids will be computed after all page imports within the space have been completed. - // TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan - parents: [documentId, makeConfluenceInternalPageId(pageId)], - tags, - timestampMs: lastPageVersionCreatedAt.getTime(), - upsertContext: { - sync_type: isBatchSync ? "batch" : "incremental", - }, - title: page.title, - mimeType: "application/vnd.dust.confluence.page", - async: true, - }); - } + await upsertConfluencePageToDataSource( + page, + spaceName, + confluenceConfig, + isBatchSync ? "batch" : "incremental", + dataSourceConfig, + loggerArgs + ); localLogger.info("Upserting Confluence page in DB."); diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index 07c6b028045f..0f25e86f417d 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -8,15 +8,18 @@ import type { CoreAPITable, PostDataSourceDocumentRequestBody, } from "@dust-tt/types"; -import { isValidDate, safeSubstring, sectionFullText } from "@dust-tt/types"; -import { MAX_CHUNK_SIZE } from "@dust-tt/types"; +import { + isValidDate, + MAX_CHUNK_SIZE, + safeSubstring, + sectionFullText, +} from "@dust-tt/types"; import type { AxiosError, AxiosRequestConfig, AxiosResponse } from "axios"; import axios from "axios"; import tracer from "dd-trace"; import http from "http"; import https from "https"; -import type { Branded } from "io-ts"; -import type { IntBrand } from "io-ts"; +import type { Branded, IntBrand } from "io-ts"; import { fromMarkdown } from "mdast-util-from-markdown"; import { gfmFromMarkdown, gfmToMarkdown } from "mdast-util-gfm"; import { toMarkdown } from "mdast-util-to-markdown"; @@ -58,7 +61,7 @@ type UpsertContext = { sync_type: "batch" | "incremental"; }; -type UpsertToDataSourceParams = { +export type UpsertToDataSourceParams = { dataSourceConfig: DataSourceConfig; documentId: string; documentContent: CoreAPIDataSourceDocumentSection;