From d931d7b3f96150dabfbf7f60615e6d3f6905eb64 Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 10 Dec 2024 15:28:25 +0100 Subject: [PATCH 01/10] first pass: upserting and deleting folders --- .../connectors/github/temporal/activities.ts | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index 1de0d13aa8cc..e7a516bb248e 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -31,6 +31,8 @@ import { renderDocumentTitleAndContent, renderMarkdownSection, upsertToDatasource, + deleteFolderNode, + upsertFolderNode, } from "@connectors/lib/data_sources"; import { ExternalOAuthTokenError } from "@connectors/lib/error"; import { @@ -668,6 +670,12 @@ export async function githubRepoGarbageCollectActivity( repoId: repoId.toString(), }, }); + + await deleteFolderNode({ + dataSourceConfig, + folderId: repoId, + loggerArgs: logger.bindings(), + }); } async function deleteIssue( @@ -846,6 +854,19 @@ async function garbageCollectCodeSync( }, }, }); + + // Also delete data source folders + const fq = new PQueue({ concurrency: 8 }); + directoriesToDelete.forEach((d) => + fq.add(async () => { + Context.current().heartbeat(); + await deleteFolderNode({ + dataSourceConfig, + folderId: d.internalId, + loggerArgs: logger.bindings(), + }); + }) + ); } } @@ -910,6 +931,12 @@ export async function githubCodeSyncActivity({ }, }); + await deleteFolderNode({ + dataSourceConfig, + folderId: repoId.toString(), + loggerArgs: logger.bindings(), + }); + return; } @@ -941,6 +968,14 @@ export async function githubCodeSyncActivity({ githubCodeRepository.lastSeenAt = codeSyncStartedAt; await githubCodeRepository.save(); + // Add as dataSource folder + await upsertFolderNode({ + dataSourceConfig, + folderId: githubCodeRepository.repoId, + title: githubCodeRepository.repoName, + parents: [githubCodeRepository.repoId], + }); + logger.info( { repoId, @@ -991,6 +1026,12 @@ export async function githubCodeSyncActivity({ }, }); + await deleteFolderNode({ + dataSourceConfig, + folderId: repoId.toString(), + loggerArgs: logger.bindings(), + }); + return; } @@ -1167,6 +1208,13 @@ export async function githubCodeSyncActivity({ }); } + await upsertFolderNode({ + dataSourceConfig, + folderId: d.internalId, + parents: [d.internalId, ...d.parents, repoId.toString()], + title: d.dirName, + }); + // If the parents have updated then the internalId gets updated as well so we should never // have an udpate to parentInternalId. We check that this is always the case. If the // directory is moved (the parents change) then it will trigger the creation of a new From 96656d3e4b863bf8835515c175fff317b11ebee1 Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 11 Dec 2024 10:24:55 +0100 Subject: [PATCH 02/10] Added own id in parents --- connectors/src/connectors/github/lib/hierarchy.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/connectors/src/connectors/github/lib/hierarchy.ts b/connectors/src/connectors/github/lib/hierarchy.ts index 1dcc4440c70a..65f8bc7c3906 100644 --- a/connectors/src/connectors/github/lib/hierarchy.ts +++ b/connectors/src/connectors/github/lib/hierarchy.ts @@ -42,9 +42,9 @@ async function getGithubCodeDirectoryParentIds( directory.parentInternalId, repoId ); - return [directory.parentInternalId, ...parents]; + return [directory.internalId, directory.parentInternalId, ...parents]; } else if (directory.parentInternalId === `github-code-${repoId}`) { - return [`github-code-${repoId}`, `${repoId}`]; + return [directory.internalId, `github-code-${repoId}`, `${repoId}`]; } return []; } @@ -72,9 +72,9 @@ async function getGithubCodeFileParentIds( file.parentInternalId, repoId ); - return [file.parentInternalId, ...parents]; + return [file.documentId, file.parentInternalId, ...parents]; } else if (file.parentInternalId === `github-code-${repoId}`) { - return [`${repoId}`, `github-code-${repoId}`]; + return [file.documentId, `${repoId}`, `github-code-${repoId}`]; } return []; } From 292aa6e40fa7612b4f564a62349088f92d7c277b Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 11 Dec 2024 10:37:15 +0100 Subject: [PATCH 03/10] Applied connector-specific linter --- connectors/src/connectors/github/temporal/activities.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index e7a516bb248e..e1ef9a5d95f0 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -27,12 +27,12 @@ import { newWebhookSignal } from "@connectors/connectors/github/temporal/signals import { getCodeSyncWorkflowId } from "@connectors/connectors/github/temporal/utils"; import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config"; import { + deleteFolderNode, deleteFromDataSource, renderDocumentTitleAndContent, renderMarkdownSection, - upsertToDatasource, - deleteFolderNode, upsertFolderNode, + upsertToDatasource, } from "@connectors/lib/data_sources"; import { ExternalOAuthTokenError } from "@connectors/lib/error"; import { From 4a24e677155e733d58359096cd28432b5711defa Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 11 Dec 2024 11:04:54 +0100 Subject: [PATCH 04/10] Fixing hierarchy --- connectors/src/connectors/github/lib/hierarchy.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/connectors/src/connectors/github/lib/hierarchy.ts b/connectors/src/connectors/github/lib/hierarchy.ts index 65f8bc7c3906..7e1adb344ab5 100644 --- a/connectors/src/connectors/github/lib/hierarchy.ts +++ b/connectors/src/connectors/github/lib/hierarchy.ts @@ -42,9 +42,9 @@ async function getGithubCodeDirectoryParentIds( directory.parentInternalId, repoId ); - return [directory.internalId, directory.parentInternalId, ...parents]; + return [directory.parentInternalId, ...parents]; } else if (directory.parentInternalId === `github-code-${repoId}`) { - return [directory.internalId, `github-code-${repoId}`, `${repoId}`]; + return [`github-code-${repoId}`, `${repoId}`]; } return []; } From d9fadcf9dbf3bba6bc281e452a1ef9bce6883526 Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 11 Dec 2024 14:35:20 +0100 Subject: [PATCH 05/10] Handling Issues, Discussion and Code nodes --- .../connectors/github/temporal/activities.ts | 55 +++++++++++++++++-- .../connectors/github/temporal/workflows.ts | 38 +++++++++++++ connectors/src/lib/data_sources.ts | 1 - 3 files changed, 87 insertions(+), 7 deletions(-) diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index e1ef9a5d95f0..acbe29c55fc7 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -309,6 +309,14 @@ export async function githubUpsertIssueActivity( issueNumber, connectorId: connector.id, }); + + // Also upsert the Issue folder node + await upsertFolderNode({ + dataSourceConfig, + folderId: `${repoId}-issues`, + title: "Issues", + parents: [`github-code-${repoId}`, repoId.toString()], + }); } async function renderDiscussion( @@ -495,6 +503,14 @@ export async function githubUpsertDiscussionActivity( discussionNumber: discussionNumber, connectorId: connector.id, }); + + // Also upsert the Discussion folder node + await upsertFolderNode({ + dataSourceConfig, + folderId: `${repoId}-discussions`, + title: "Discussions", + parents: [`github-code-${repoId}`, repoId.toString()], + }); } export async function githubGetRepoDiscussionsResultPageActivity( @@ -632,6 +648,12 @@ export async function githubRepoGarbageCollectActivity( ); } + // Delete the Issues folder node + await deleteFolderNode({ + dataSourceConfig, + folderId: `${repoId}-issues`, + }); + const discussionsInRepo = await GithubDiscussion.findAll({ where: { repoId, @@ -653,6 +675,12 @@ export async function githubRepoGarbageCollectActivity( ); } + // Delete the Discussion folder node + await deleteFolderNode({ + dataSourceConfig, + folderId: `${repoId}-discussions`, + }); + await Promise.all(promises); await garbageCollectCodeSync( @@ -674,7 +702,6 @@ export async function githubRepoGarbageCollectActivity( await deleteFolderNode({ dataSourceConfig, folderId: repoId, - loggerArgs: logger.bindings(), }); } @@ -854,7 +881,6 @@ async function garbageCollectCodeSync( }, }, }); - // Also delete data source folders const fq = new PQueue({ concurrency: 8 }); directoriesToDelete.forEach((d) => @@ -863,11 +889,16 @@ async function garbageCollectCodeSync( await deleteFolderNode({ dataSourceConfig, folderId: d.internalId, - loggerArgs: logger.bindings(), }); }) ); } + + // Delete the "Code" folder node + await deleteFolderNode({ + dataSourceConfig, + folderId: `github-code-${repoId}`, + }); } export async function githubCodeSyncActivity({ @@ -931,10 +962,10 @@ export async function githubCodeSyncActivity({ }, }); + // Delete the data source folder too await deleteFolderNode({ dataSourceConfig, folderId: repoId.toString(), - loggerArgs: logger.bindings(), }); return; @@ -1029,7 +1060,6 @@ export async function githubCodeSyncActivity({ await deleteFolderNode({ dataSourceConfig, folderId: repoId.toString(), - loggerArgs: logger.bindings(), }); return; @@ -1211,7 +1241,12 @@ export async function githubCodeSyncActivity({ await upsertFolderNode({ dataSourceConfig, folderId: d.internalId, - parents: [d.internalId, ...d.parents, repoId.toString()], + parents: [ + d.internalId, + ...d.parents, + `github-code-${repoId.toString()}`, + repoId.toString(), + ], title: d.dirName, }); @@ -1254,6 +1289,14 @@ export async function githubCodeSyncActivity({ logger.child({ task: "garbageCollectCodeSync" }) ); + // Create the Code folder node. + await upsertFolderNode({ + dataSourceConfig, + folderId: `github-code-${repoId}`, + title: "Code", + parents: [`github-code-${repoId}`, repoId.toString()], + }); + // Finally we update the repository updatedAt value. if (repoUpdatedAt) { githubCodeRepository.codeUpdatedAt = repoUpdatedAt; diff --git a/connectors/src/connectors/github/temporal/workflows.ts b/connectors/src/connectors/github/temporal/workflows.ts index 5e79a411b0b4..cac1a9bce10c 100644 --- a/connectors/src/connectors/github/temporal/workflows.ts +++ b/connectors/src/connectors/github/temporal/workflows.ts @@ -10,6 +10,10 @@ import { import PQueue from "p-queue"; import type * as activities from "@connectors/connectors/github/temporal/activities"; +import { + deleteFolderNode, + upsertFolderNode, +} from "@connectors/lib/data_sources"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; import { newWebhookSignal } from "./signals"; @@ -187,6 +191,23 @@ export async function githubRepoIssuesSyncWorkflow({ { repoId } ); + // Create/Delete data source folder based on whether there are issues or not. + if (pageNumber === 1) { + if (!resultsPage.length) { + await deleteFolderNode({ + dataSourceConfig, + folderId: `${repoId}-issues`, + }); + } else { + await upsertFolderNode({ + dataSourceConfig, + folderId: `${repoId}-issues`, + parents: [`${repoId}-issues`, `${repoId}`], + title: "Issues", + }); + } + } + if (!resultsPage.length) { return false; } @@ -242,6 +263,23 @@ export async function githubRepoDiscussionsSyncWorkflow({ { repoId } ); + // Create/Delete data source folder based on whether there are discussions or not. + if (!nextCursor) { + if (!discussionNumbers.length) { + await deleteFolderNode({ + dataSourceConfig, + folderId: `${repoId}-discussions`, + }); + } else { + await upsertFolderNode({ + dataSourceConfig, + folderId: `${repoId}-discussions`, + parents: [`${repoId}-discussions`, `${repoId}`], + title: "Discussions", + }); + } + } + for (const discussionNumber of discussionNumbers) { promises.push( queue.add(() => diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index cd9dfda01b74..e9dd20e2099d 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -1162,7 +1162,6 @@ export async function deleteFolderNode({ }: { dataSourceConfig: DataSourceConfig; folderId: string; - loggerArgs?: Record; }) { const r = await getDustAPI(dataSourceConfig).deleteFolder( dataSourceConfig.dataSourceId, From 1668c9feaf24890fb165aaaa6b85ad5db7470aa4 Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 11 Dec 2024 15:35:26 +0100 Subject: [PATCH 06/10] Moved logic outside of Workflow --- .../connectors/github/temporal/activities.ts | 25 +++++++----- .../connectors/github/temporal/workflows.ts | 38 ------------------- 2 files changed, 15 insertions(+), 48 deletions(-) diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index acbe29c55fc7..742c2a98d5f1 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -699,6 +699,7 @@ export async function githubRepoGarbageCollectActivity( }, }); + // Delete the Repository folder node await deleteFolderNode({ dataSourceConfig, folderId: repoId, @@ -881,7 +882,7 @@ async function garbageCollectCodeSync( }, }, }); - // Also delete data source folders + // Also delete folder nodes const fq = new PQueue({ concurrency: 8 }); directoriesToDelete.forEach((d) => fq.add(async () => { @@ -894,7 +895,7 @@ async function garbageCollectCodeSync( ); } - // Delete the "Code" folder node + // Delete the Code folder await deleteFolderNode({ dataSourceConfig, folderId: `github-code-${repoId}`, @@ -962,7 +963,7 @@ export async function githubCodeSyncActivity({ }, }); - // Delete the data source folder too + // Delete the Repository folder too await deleteFolderNode({ dataSourceConfig, folderId: repoId.toString(), @@ -999,7 +1000,7 @@ export async function githubCodeSyncActivity({ githubCodeRepository.lastSeenAt = codeSyncStartedAt; await githubCodeRepository.save(); - // Add as dataSource folder + // Create the Repository folder node await upsertFolderNode({ dataSourceConfig, folderId: githubCodeRepository.repoId, @@ -1057,6 +1058,7 @@ export async function githubCodeSyncActivity({ }, }); + // Delete the Repository folder too await deleteFolderNode({ dataSourceConfig, folderId: repoId.toString(), @@ -1238,6 +1240,7 @@ export async function githubCodeSyncActivity({ }); } + // Create the folder node await upsertFolderNode({ dataSourceConfig, folderId: d.internalId, @@ -1290,12 +1293,14 @@ export async function githubCodeSyncActivity({ ); // Create the Code folder node. - await upsertFolderNode({ - dataSourceConfig, - folderId: `github-code-${repoId}`, - title: "Code", - parents: [`github-code-${repoId}`, repoId.toString()], - }); + if (files.length > 0 || directories.length > 0) { + await upsertFolderNode({ + dataSourceConfig, + folderId: `github-code-${repoId}`, + title: "Code", + parents: [`github-code-${repoId}`, repoId.toString()], + }); + } // Finally we update the repository updatedAt value. if (repoUpdatedAt) { diff --git a/connectors/src/connectors/github/temporal/workflows.ts b/connectors/src/connectors/github/temporal/workflows.ts index cac1a9bce10c..5e79a411b0b4 100644 --- a/connectors/src/connectors/github/temporal/workflows.ts +++ b/connectors/src/connectors/github/temporal/workflows.ts @@ -10,10 +10,6 @@ import { import PQueue from "p-queue"; import type * as activities from "@connectors/connectors/github/temporal/activities"; -import { - deleteFolderNode, - upsertFolderNode, -} from "@connectors/lib/data_sources"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; import { newWebhookSignal } from "./signals"; @@ -191,23 +187,6 @@ export async function githubRepoIssuesSyncWorkflow({ { repoId } ); - // Create/Delete data source folder based on whether there are issues or not. - if (pageNumber === 1) { - if (!resultsPage.length) { - await deleteFolderNode({ - dataSourceConfig, - folderId: `${repoId}-issues`, - }); - } else { - await upsertFolderNode({ - dataSourceConfig, - folderId: `${repoId}-issues`, - parents: [`${repoId}-issues`, `${repoId}`], - title: "Issues", - }); - } - } - if (!resultsPage.length) { return false; } @@ -263,23 +242,6 @@ export async function githubRepoDiscussionsSyncWorkflow({ { repoId } ); - // Create/Delete data source folder based on whether there are discussions or not. - if (!nextCursor) { - if (!discussionNumbers.length) { - await deleteFolderNode({ - dataSourceConfig, - folderId: `${repoId}-discussions`, - }); - } else { - await upsertFolderNode({ - dataSourceConfig, - folderId: `${repoId}-discussions`, - parents: [`${repoId}-discussions`, `${repoId}`], - title: "Discussions", - }); - } - } - for (const discussionNumber of discussionNumbers) { promises.push( queue.add(() => From 7c435ae4f285ba2be4ad26c8ad5fa04900e0b9f8 Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 11 Dec 2024 18:06:13 +0100 Subject: [PATCH 07/10] Oversight --- connectors/src/connectors/github/lib/hierarchy.ts | 4 ++-- connectors/src/connectors/github/temporal/activities.ts | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/connectors/src/connectors/github/lib/hierarchy.ts b/connectors/src/connectors/github/lib/hierarchy.ts index 7e1adb344ab5..1dcc4440c70a 100644 --- a/connectors/src/connectors/github/lib/hierarchy.ts +++ b/connectors/src/connectors/github/lib/hierarchy.ts @@ -72,9 +72,9 @@ async function getGithubCodeFileParentIds( file.parentInternalId, repoId ); - return [file.documentId, file.parentInternalId, ...parents]; + return [file.parentInternalId, ...parents]; } else if (file.parentInternalId === `github-code-${repoId}`) { - return [file.documentId, `${repoId}`, `github-code-${repoId}`]; + return [`${repoId}`, `github-code-${repoId}`]; } return []; } diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index 742c2a98d5f1..2391c0b6f722 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -315,7 +315,7 @@ export async function githubUpsertIssueActivity( dataSourceConfig, folderId: `${repoId}-issues`, title: "Issues", - parents: [`github-code-${repoId}`, repoId.toString()], + parents: [`${repoId}-issues`, repoId.toString()], }); } @@ -509,7 +509,7 @@ export async function githubUpsertDiscussionActivity( dataSourceConfig, folderId: `${repoId}-discussions`, title: "Discussions", - parents: [`github-code-${repoId}`, repoId.toString()], + parents: [`${repoId}-discussions`, repoId.toString()], }); } From f48965485edd341a9b3337a52df28cded40cf66e Mon Sep 17 00:00:00 2001 From: Lucas Date: Thu, 12 Dec 2024 15:46:40 +0100 Subject: [PATCH 08/10] Wrote migration script for Github --- ...212_backfill_github_data_source_folders.ts | 124 ++++++++++++++++++ .../src/connectors/github/lib/hierarchy.ts | 2 +- 2 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 connectors/migrations/20241212_backfill_github_data_source_folders.ts diff --git a/connectors/migrations/20241212_backfill_github_data_source_folders.ts b/connectors/migrations/20241212_backfill_github_data_source_folders.ts new file mode 100644 index 000000000000..6bf4589db8d0 --- /dev/null +++ b/connectors/migrations/20241212_backfill_github_data_source_folders.ts @@ -0,0 +1,124 @@ +import { getGithubCodeDirectoryParentIds } from "@connectors/connectors/github/lib/hierarchy"; +import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config"; +import { upsertFolderNode } from "@connectors/lib/data_sources"; +import { + GithubCodeDirectory, + GithubCodeFile, + GithubCodeRepository, + GithubDiscussion, + GithubIssue, +} from "@connectors/lib/models/github"; +import { ConnectorModel } from "@connectors/resources/storage/models/connector_model"; + +async function main() { + const connectors = await ConnectorModel.findAll({ + where: { + type: "github", + }, + }); + + for (const connector of connectors) { + console.log(`Processing connector ${connector.id}...`); + + const dataSourceConfig = dataSourceConfigFromConnector(connector); + const repositories = await GithubCodeRepository.findAll({ + where: { + connectorId: connector.id, + }, + }); + + for (const repository of repositories) { + // upsert repository as folder + const repoFolderId = repository.repoId; + await upsertFolderNode({ + dataSourceConfig, + folderId: repoFolderId, + parents: [repoFolderId], + title: repository.repoName, + }); + + // Upsert Code folder if we have some (file or directory) + const hasCodeDirectory = await GithubCodeDirectory.findOne({ + where: { + connectorId: connector.id, + repoId: repository.repoId, + }, + }); + const hasCodeFile = await GithubCodeFile.findOne({ + where: { + connectorId: connector.id, + repoId: repository.repoId, + }, + }); + if (hasCodeDirectory || hasCodeFile) { + const codeFolderId = `github-code-${repository.repoId}`; + await upsertFolderNode({ + dataSourceConfig, + folderId: codeFolderId, + parents: [codeFolderId, repoFolderId], + title: "Code", + }); + } + + const directories = await GithubCodeDirectory.findAll({ + where: { + connectorId: connector.id, + }, + }); + // upsert directories as folders, in chunks + for (let i = 0; i < directories.length; i += 16) { + const chunk = directories.slice(i, i + 16); + await Promise.all( + chunk.map(async (directory) => { + // This already contains IDs for Code and Repository folders + const parents = await getGithubCodeDirectoryParentIds( + connector.id, + directory.internalId, + repository.id + ); + await upsertFolderNode({ + dataSourceConfig, + folderId: directory.internalId, + parents: [directory.internalId, ...parents], + title: directory.dirName, + }); + }) + ); + } + + // Upsert issue folder if we have issues + if (await GithubIssue.findOne({ where: { repoId: repository.repoId } })) { + const issuesFolderId = `${repository.repoId}-issues`; + await upsertFolderNode({ + dataSourceConfig, + folderId: issuesFolderId, + parents: [issuesFolderId, repoFolderId], + title: "Issues", + }); + } + + // Upsert discussion folder if we have discussions + if ( + await GithubDiscussion.findOne({ where: { repoId: repository.repoId } }) + ) { + const discussionsFolderId = `${repository.repoId}-discussions`; + await upsertFolderNode({ + dataSourceConfig, + folderId: discussionsFolderId, + parents: [discussionsFolderId, repoFolderId], + title: "Discussions", + }); + } + } + } +} + +main() + .then(() => { + console.log("Done"); + process.exit(0); + }) + .catch((err) => { + console.error(err); + process.exit(1); + }); diff --git a/connectors/src/connectors/github/lib/hierarchy.ts b/connectors/src/connectors/github/lib/hierarchy.ts index 1dcc4440c70a..893dfc179cec 100644 --- a/connectors/src/connectors/github/lib/hierarchy.ts +++ b/connectors/src/connectors/github/lib/hierarchy.ts @@ -19,7 +19,7 @@ export async function getGithubCodeOrDirectoryParentIds( return []; } -async function getGithubCodeDirectoryParentIds( +export async function getGithubCodeDirectoryParentIds( connectorId: ModelId, internalId: string, repoId: number From 3e3c30849954ec5a97cdf6da1c48ed9b39f8803c Mon Sep 17 00:00:00 2001 From: Lucas Date: Thu, 12 Dec 2024 16:28:48 +0100 Subject: [PATCH 09/10] Removed unnecessary logging --- .../migrations/20241212_backfill_github_data_source_folders.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/connectors/migrations/20241212_backfill_github_data_source_folders.ts b/connectors/migrations/20241212_backfill_github_data_source_folders.ts index 6bf4589db8d0..33183c591845 100644 --- a/connectors/migrations/20241212_backfill_github_data_source_folders.ts +++ b/connectors/migrations/20241212_backfill_github_data_source_folders.ts @@ -18,8 +18,6 @@ async function main() { }); for (const connector of connectors) { - console.log(`Processing connector ${connector.id}...`); - const dataSourceConfig = dataSourceConfigFromConnector(connector); const repositories = await GithubCodeRepository.findAll({ where: { From babaeaf41b439f0b05926688126ac0ed3b13225a Mon Sep 17 00:00:00 2001 From: Lucas Date: Thu, 12 Dec 2024 17:21:13 +0100 Subject: [PATCH 10/10] Better concern separation --- ...212_backfill_github_data_source_folders.ts | 202 +++++++++++++----- 1 file changed, 146 insertions(+), 56 deletions(-) diff --git a/connectors/migrations/20241212_backfill_github_data_source_folders.ts b/connectors/migrations/20241212_backfill_github_data_source_folders.ts index 33183c591845..977085680e7b 100644 --- a/connectors/migrations/20241212_backfill_github_data_source_folders.ts +++ b/connectors/migrations/20241212_backfill_github_data_source_folders.ts @@ -9,6 +9,7 @@ import { GithubIssue, } from "@connectors/lib/models/github"; import { ConnectorModel } from "@connectors/resources/storage/models/connector_model"; +import type { DataSourceConfig } from "@connectors/types/data_source_config"; async function main() { const connectors = await ConnectorModel.findAll({ @@ -27,35 +28,20 @@ async function main() { for (const repository of repositories) { // upsert repository as folder - const repoFolderId = repository.repoId; - await upsertFolderNode({ - dataSourceConfig, - folderId: repoFolderId, - parents: [repoFolderId], - title: repository.repoName, - }); + // Throws if error + const repoFolderId = await upsertRepositoryFolderNode( + repository, + dataSourceConfig + ); // Upsert Code folder if we have some (file or directory) - const hasCodeDirectory = await GithubCodeDirectory.findOne({ - where: { - connectorId: connector.id, - repoId: repository.repoId, - }, - }); - const hasCodeFile = await GithubCodeFile.findOne({ - where: { - connectorId: connector.id, - repoId: repository.repoId, - }, - }); - if (hasCodeDirectory || hasCodeFile) { - const codeFolderId = `github-code-${repository.repoId}`; - await upsertFolderNode({ - dataSourceConfig, - folderId: codeFolderId, - parents: [codeFolderId, repoFolderId], - title: "Code", - }); + const shouldCreateCodeFolder = await repositoryContainsCode(repository); + if (shouldCreateCodeFolder) { + await upsertCodeFolderNode( + repository.repoId, + repoFolderId, + dataSourceConfig + ); } const directories = await GithubCodeDirectory.findAll({ @@ -68,49 +54,153 @@ async function main() { const chunk = directories.slice(i, i + 16); await Promise.all( chunk.map(async (directory) => { - // This already contains IDs for Code and Repository folders - const parents = await getGithubCodeDirectoryParentIds( - connector.id, - directory.internalId, - repository.id + await upsertDirectoryFolderNode( + directory, + repository.id, + dataSourceConfig ); - await upsertFolderNode({ - dataSourceConfig, - folderId: directory.internalId, - parents: [directory.internalId, ...parents], - title: directory.dirName, - }); }) ); } // Upsert issue folder if we have issues - if (await GithubIssue.findOne({ where: { repoId: repository.repoId } })) { - const issuesFolderId = `${repository.repoId}-issues`; - await upsertFolderNode({ - dataSourceConfig, - folderId: issuesFolderId, - parents: [issuesFolderId, repoFolderId], - title: "Issues", - }); + const shouldCreateIssueFolder = + await repositoryContainsIssues(repository); + if (shouldCreateIssueFolder) { + await upsertIssueFolderNode( + repository.repoId, + repoFolderId, + dataSourceConfig + ); } // Upsert discussion folder if we have discussions - if ( - await GithubDiscussion.findOne({ where: { repoId: repository.repoId } }) - ) { - const discussionsFolderId = `${repository.repoId}-discussions`; - await upsertFolderNode({ - dataSourceConfig, - folderId: discussionsFolderId, - parents: [discussionsFolderId, repoFolderId], - title: "Discussions", - }); + const shouldCreateDiscussionFolder = + await repositoryContainsDiscussions(repository); + if (shouldCreateDiscussionFolder) { + await upsertDiscussionFolderNode( + repository.repoId, + repoFolderId, + dataSourceConfig + ); } } } } +async function upsertRepositoryFolderNode( + repository: GithubCodeRepository, + dataSourceConfig: DataSourceConfig +) { + const repoFolderId = repository.repoId; + await upsertFolderNode({ + dataSourceConfig, + folderId: repoFolderId, + parents: [repoFolderId], + title: repository.repoName, + }); + return repoFolderId; +} + +async function upsertCodeFolderNode( + repositoryId: string, + repositoryNodeId: string, + dataSourceConfig: DataSourceConfig +) { + const codeFolderId = `github-code-${repositoryId}`; + await upsertFolderNode({ + dataSourceConfig, + folderId: codeFolderId, + parents: [codeFolderId, repositoryNodeId], + title: "Code", + }); + return codeFolderId; +} + +async function upsertDirectoryFolderNode( + directory: GithubCodeDirectory, + repositoryId: number, + dataSourceConfig: DataSourceConfig +) { + // This already contains IDs for Code and Repository folders + const parents = await getGithubCodeDirectoryParentIds( + directory.connectorId, + directory.internalId, + repositoryId + ); + await upsertFolderNode({ + dataSourceConfig, + folderId: directory.internalId, + parents: [directory.internalId, ...parents], + title: directory.dirName, + }); +} + +async function upsertIssueFolderNode( + repositoryId: string, + repositoryNodeId: string, + dataSourceConfig: DataSourceConfig +) { + const issuesFolderId = `${repositoryId}-issues`; + await upsertFolderNode({ + dataSourceConfig, + folderId: issuesFolderId, + parents: [issuesFolderId, repositoryNodeId], + title: "Issues", + }); +} + +async function upsertDiscussionFolderNode( + repositoryId: string, + repositoryNodeId: string, + dataSourceConfig: DataSourceConfig +) { + const discussionsFolderId = `${repositoryId}-discussions`; + await upsertFolderNode({ + dataSourceConfig, + folderId: discussionsFolderId, + parents: [discussionsFolderId, repositoryNodeId], + title: "Discussions", + }); +} + +async function repositoryContainsCode(repository: GithubCodeRepository) { + const directory = await GithubCodeDirectory.findOne({ + where: { + connectorId: repository.connectorId, + repoId: repository.repoId, + }, + }); + if (directory) { + return true; + } + const file = await GithubCodeFile.findOne({ + where: { + connectorId: repository.connectorId, + repoId: repository.repoId, + }, + }); + return !!file; +} + +async function repositoryContainsIssues(repository: GithubCodeRepository) { + const issue = await GithubIssue.findOne({ + where: { + repoId: repository.repoId, + }, + }); + return !!issue; +} + +async function repositoryContainsDiscussions(repository: GithubCodeRepository) { + const discussion = await GithubDiscussion.findOne({ + where: { + repoId: repository.repoId, + }, + }); + return !!discussion; +} + main() .then(() => { console.log("Done");