Skip to content

Commit

Permalink
refactor: move page upsertion into a dedicated function
Browse files Browse the repository at this point in the history
  • Loading branch information
aubin-tchoi committed Dec 13, 2024
1 parent 28f735d commit 157fe74
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 70 deletions.
151 changes: 86 additions & 65 deletions connectors/src/connectors/confluence/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import {
} from "@connectors/connectors/confluence/temporal/utils";
import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
import { concurrentExecutor } from "@connectors/lib/async_utils";
import type { UpsertToDataSourceParams } from "@connectors/lib/data_sources";
import {
deleteFromDataSource,
renderDocumentTitleAndContent,
Expand Down Expand Up @@ -220,7 +221,83 @@ export async function markPageHasVisited({
);
}

async function upsertConfluencePageInDb(
/**
 * Converts a Confluence page body to markdown and upserts it as a document
 * into the data source.
 *
 * Nothing is rendered or upserted when the markdown conversion yields an
 * empty string.
 *
 * @param page - Confluence page; its storage body, title, version, labels and
 *   tiny UI link are read here.
 * @param spaceName - Space name, used in the rendered title and the `space:` tag.
 * @param confluenceConfig - Provides the base URL used to build the document URL.
 * @param syncType - Forwarded as `upsertContext.sync_type`.
 * @param dataSourceConfig - Data source the document is rendered for and upserted into.
 * @param loggerArgs - Bound to the local child logger and forwarded to the upsert call.
 */
export async function upsertConfluencePageToDataSource(
  page: NonNullable<Awaited<ReturnType<ConfluenceClient["getPageById"]>>>,
  spaceName: string,
  confluenceConfig: ConfluenceConfiguration,
  syncType: UpsertToDataSourceParams["upsertContext"]["sync_type"],
  dataSourceConfig: DataSourceConfig,
  loggerArgs: Record<string, string | number>
) {
  const pageLogger = logger.child(loggerArgs);

  const pageMarkdown = turndownService.turndown(page.body.storage.value);
  const createdAt = new Date(page.createdAt);
  const updatedAt = new Date(page.version.createdAt);

  // Pages whose body converts to empty markdown are not upserted.
  if (!pageMarkdown) {
    return;
  }

  const content = await renderMarkdownSection(dataSourceConfig, pageMarkdown);
  const documentContent = await renderDocumentTitleAndContent({
    dataSourceConfig,
    title: `Page ${page.title} Space ${spaceName}`,
    createdAt,
    updatedAt,
    content,
  });

  const documentId = makeConfluencePageId(page.id);
  const documentUrl = makeConfluenceDocumentUrl({
    baseUrl: confluenceConfig.url,
    suffix: page._links.tinyui,
  });

  // We log the number of labels to help define the importance of labels in the future.
  const labels = page.labels.results;
  const labelsCount = labels.length;
  if (labelsCount > 0) {
    pageLogger.info({ labelsCount }, "Confluence page has labels.");
  }

  const createdAtMs = createdAt.getTime();
  const updatedAtMs = updatedAt.getTime();

  // Limit to 10 custom tags.
  const labelTags = labels.slice(0, 10).map((label) => `labels:${label.id}`);

  const tags = [
    `createdAt:${createdAtMs}`,
    `space:${spaceName}`,
    `title:${page.title}`,
    `updatedAt:${updatedAtMs}`,
    `version:${page.version.number}`,
    ...labelTags,
  ];

  await upsertToDatasource({
    dataSourceConfig,
    documentContent,
    documentId,
    documentUrl,
    loggerArgs,
    // Parent Ids will be computed after all page imports within the space have been completed.
    // TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan
    parents: [documentId, makeConfluenceInternalPageId(page.id)],
    tags,
    timestampMs: updatedAtMs,
    upsertContext: {
      sync_type: syncType,
    },
    title: page.title,
    mimeType: "application/vnd.dust.confluence.page",
    async: true,
  });
}

export async function upsertConfluencePageInDb(
connectorId: ModelId,
page: ConfluencePageWithBodyType,
visitedAtMs: number
Expand Down Expand Up @@ -335,70 +412,14 @@ export async function confluenceCheckAndUpsertPageActivity({

localLogger.info("Upserting Confluence page.");

const markdown = turndownService.turndown(page.body.storage.value);
const pageCreatedAt = new Date(page.createdAt);
const lastPageVersionCreatedAt = new Date(page.version.createdAt);

if (markdown) {
const renderedMarkdown = await renderMarkdownSection(
dataSourceConfig,
markdown
);
const renderedPage = await renderDocumentTitleAndContent({
dataSourceConfig,
title: `Page ${page.title} Space ${spaceName}`,
createdAt: pageCreatedAt,
updatedAt: lastPageVersionCreatedAt,
content: renderedMarkdown,
});

const documentId = makeConfluencePageId(pageId);
const documentUrl = makeConfluenceDocumentUrl({
baseUrl: confluenceConfig.url,
suffix: page._links.tinyui,
});

// We log the number of labels to help define the importance of labels in the future.
if (page.labels.results.length > 0) {
localLogger.info(
{ labelsCount: page.labels.results.length },
"Confluence page has labels."
);
}

// Limit to 10 custom tags.
const customTags = page.labels.results
.slice(0, 10)
.map((l) => `labels:${l.id}`);

const tags = [
`createdAt:${pageCreatedAt.getTime()}`,
`space:${spaceName}`,
`title:${page.title}`,
`updatedAt:${lastPageVersionCreatedAt.getTime()}`,
`version:${page.version.number}`,
...customTags,
];

await upsertToDatasource({
dataSourceConfig,
documentContent: renderedPage,
documentId,
documentUrl,
loggerArgs,
// Parent Ids will be computed after all page imports within the space have been completed.
// TODO(2024-12-11 aubin): we upsert parents x2 (old and new), this is the first step of the backfill plan
parents: [documentId, makeConfluenceInternalPageId(pageId)],
tags,
timestampMs: lastPageVersionCreatedAt.getTime(),
upsertContext: {
sync_type: isBatchSync ? "batch" : "incremental",
},
title: page.title,
mimeType: "application/vnd.dust.confluence.page",
async: true,
});
}
await upsertConfluencePageToDataSource(
page,
spaceName,
confluenceConfig,
isBatchSync ? "batch" : "incremental",
dataSourceConfig,
loggerArgs
);

localLogger.info("Upserting Confluence page in DB.");

Expand Down
13 changes: 8 additions & 5 deletions connectors/src/lib/data_sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,18 @@ import type {
CoreAPITable,
PostDataSourceDocumentRequestBody,
} from "@dust-tt/types";
import { isValidDate, safeSubstring, sectionFullText } from "@dust-tt/types";
import { MAX_CHUNK_SIZE } from "@dust-tt/types";
import {
isValidDate,
MAX_CHUNK_SIZE,
safeSubstring,
sectionFullText,
} from "@dust-tt/types";
import type { AxiosError, AxiosRequestConfig, AxiosResponse } from "axios";
import axios from "axios";
import tracer from "dd-trace";
import http from "http";
import https from "https";
import type { Branded } from "io-ts";
import type { IntBrand } from "io-ts";
import type { Branded, IntBrand } from "io-ts";
import { fromMarkdown } from "mdast-util-from-markdown";
import { gfmFromMarkdown, gfmToMarkdown } from "mdast-util-gfm";
import { toMarkdown } from "mdast-util-to-markdown";
Expand Down Expand Up @@ -58,7 +61,7 @@ type UpsertContext = {
sync_type: "batch" | "incremental";
};

type UpsertToDataSourceParams = {
export type UpsertToDataSourceParams = {
dataSourceConfig: DataSourceConfig;
documentId: string;
documentContent: CoreAPIDataSourceDocumentSection;
Expand Down

0 comments on commit 157fe74

Please sign in to comment.