diff --git a/connectors/migrations/20241212_backfill_github_data_source_folders.ts b/connectors/migrations/20241212_backfill_github_data_source_folders.ts
new file mode 100644
index 000000000000..977085680e7b
--- /dev/null
+++ b/connectors/migrations/20241212_backfill_github_data_source_folders.ts
@@ -0,0 +1,241 @@
+import { getGithubCodeDirectoryParentIds } from "@connectors/connectors/github/lib/hierarchy";
+import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
+import { upsertFolderNode } from "@connectors/lib/data_sources";
+import {
+  GithubCodeDirectory,
+  GithubCodeFile,
+  GithubCodeRepository,
+  GithubDiscussion,
+  GithubIssue,
+} from "@connectors/lib/models/github";
+import { ConnectorModel } from "@connectors/resources/storage/models/connector_model";
+import type { DataSourceConfig } from "@connectors/types/data_source_config";
+
+// Number of directory folder nodes upserted concurrently.
+const UPSERT_CHUNK_SIZE = 16;
+
+/**
+ * Backfills data source folder nodes for every GitHub connector: one folder per
+ * repository, plus "Code", per-directory, "Issues" and "Discussions" folders
+ * when the repository has matching rows. Any upsert error aborts the script.
+ */
+async function main() {
+  const connectors = await ConnectorModel.findAll({
+    where: {
+      type: "github",
+    },
+  });
+
+  for (const connector of connectors) {
+    const dataSourceConfig = dataSourceConfigFromConnector(connector);
+    const repositories = await GithubCodeRepository.findAll({
+      where: {
+        connectorId: connector.id,
+      },
+    });
+
+    for (const repository of repositories) {
+      // Upsert the repository as a folder (throws on error).
+      const repoFolderId = await upsertRepositoryFolderNode(
+        repository,
+        dataSourceConfig
+      );
+
+      // Upsert the Code folder if the repository has any file or directory.
+      const shouldCreateCodeFolder = await repositoryContainsCode(repository);
+      if (shouldCreateCodeFolder) {
+        await upsertCodeFolderNode(
+          repository.repoId,
+          repoFolderId,
+          dataSourceConfig
+        );
+      }
+
+      // Only fetch this repository's directories: without the repoId filter,
+      // every directory of the connector is re-upserted for each repository.
+      const directories = await GithubCodeDirectory.findAll({
+        where: {
+          connectorId: connector.id,
+          repoId: repository.repoId,
+        },
+      });
+      // getGithubCodeDirectoryParentIds takes the GitHub repository ID (stored
+      // as a string on the model) as a number, not the database row ID.
+      const githubRepoId = parseInt(repository.repoId, 10);
+      // Upsert directories as folders, in chunks.
+      for (let i = 0; i < directories.length; i += UPSERT_CHUNK_SIZE) {
+        const chunk = directories.slice(i, i + UPSERT_CHUNK_SIZE);
+        await Promise.all(
+          chunk.map((directory) =>
+            upsertDirectoryFolderNode(directory, githubRepoId, dataSourceConfig)
+          )
+        );
+      }
+
+      // Upsert the Issues folder if the repository has issues.
+      const shouldCreateIssueFolder =
+        await repositoryContainsIssues(repository);
+      if (shouldCreateIssueFolder) {
+        await upsertIssueFolderNode(
+          repository.repoId,
+          repoFolderId,
+          dataSourceConfig
+        );
+      }
+
+      // Upsert the Discussions folder if the repository has discussions.
+      const shouldCreateDiscussionFolder =
+        await repositoryContainsDiscussions(repository);
+      if (shouldCreateDiscussionFolder) {
+        await upsertDiscussionFolderNode(
+          repository.repoId,
+          repoFolderId,
+          dataSourceConfig
+        );
+      }
+    }
+  }
+}
+
+/** Upserts the repository's top-level folder node and returns its ID. */
+async function upsertRepositoryFolderNode(
+  repository: GithubCodeRepository,
+  dataSourceConfig: DataSourceConfig
+) {
+  const repoFolderId = repository.repoId;
+  await upsertFolderNode({
+    dataSourceConfig,
+    folderId: repoFolderId,
+    parents: [repoFolderId],
+    title: repository.repoName,
+  });
+  return repoFolderId;
+}
+
+/** Upserts the "Code" folder node under the repository; returns its ID. */
+async function upsertCodeFolderNode(
+  repositoryId: string,
+  repositoryNodeId: string,
+  dataSourceConfig: DataSourceConfig
+) {
+  const codeFolderId = `github-code-${repositoryId}`;
+  await upsertFolderNode({
+    dataSourceConfig,
+    folderId: codeFolderId,
+    parents: [codeFolderId, repositoryNodeId],
+    title: "Code",
+  });
+  return codeFolderId;
+}
+
+/**
+ * Upserts the folder node of a code directory.
+ * `repositoryId` must be the numeric GitHub repository ID, as expected by
+ * `getGithubCodeDirectoryParentIds`.
+ */
+async function upsertDirectoryFolderNode(
+  directory: GithubCodeDirectory,
+  repositoryId: number,
+  dataSourceConfig: DataSourceConfig
+) {
+  // This already contains IDs for Code and Repository folders
+  const parents = await getGithubCodeDirectoryParentIds(
+    directory.connectorId,
+    directory.internalId,
+    repositoryId
+  );
+  await upsertFolderNode({
+    dataSourceConfig,
+    folderId: directory.internalId,
+    parents: [directory.internalId, ...parents],
+    title: directory.dirName,
+  });
+}
+
+/** Upserts the "Issues" folder node under the repository folder. */
+async function upsertIssueFolderNode(
+  repositoryId: string,
+  repositoryNodeId: string,
+  dataSourceConfig: DataSourceConfig
+) {
+  const issuesFolderId = `${repositoryId}-issues`;
+  await upsertFolderNode({
+    dataSourceConfig,
+    folderId: issuesFolderId,
+    parents: [issuesFolderId, repositoryNodeId],
+    title: "Issues",
+  });
+}
+
+/** Upserts the "Discussions" folder node under the repository folder. */
+async function upsertDiscussionFolderNode(
+  repositoryId: string,
+  repositoryNodeId: string,
+  dataSourceConfig: DataSourceConfig
+) {
+  const discussionsFolderId = `${repositoryId}-discussions`;
+  await upsertFolderNode({
+    dataSourceConfig,
+    folderId: discussionsFolderId,
+    parents: [discussionsFolderId, repositoryNodeId],
+    title: "Discussions",
+  });
+}
+
+/** Returns true if the repository has at least one directory or file row. */
+async function repositoryContainsCode(repository: GithubCodeRepository) {
+  const directory = await GithubCodeDirectory.findOne({
+    where: {
+      connectorId: repository.connectorId,
+      repoId: repository.repoId,
+    },
+  });
+  if (directory) {
+    return true;
+  }
+  const file = await GithubCodeFile.findOne({
+    where: {
+      connectorId: repository.connectorId,
+      repoId: repository.repoId,
+    },
+  });
+  return !!file;
+}
+
+/**
+ * Returns true if this connector has an issue row for the repository.
+ * Scoped by connectorId so rows of other connectors are not counted.
+ */
+async function repositoryContainsIssues(repository: GithubCodeRepository) {
+  const issue = await GithubIssue.findOne({
+    where: {
+      connectorId: repository.connectorId,
+      repoId: repository.repoId,
+    },
+  });
+  return !!issue;
+}
+
+/**
+ * Returns true if this connector has a discussion row for the repository.
+ * Scoped by connectorId so rows of other connectors are not counted.
+ */
+async function repositoryContainsDiscussions(repository: GithubCodeRepository) {
+  const discussion = await GithubDiscussion.findOne({
+    where: {
+      connectorId: repository.connectorId,
+      repoId: repository.repoId,
+    },
+  });
+  return !!discussion;
+}
+
+main()
+  .then(() => {
+    console.log("Done");
+    process.exit(0);
+  })
+  .catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
diff --git a/connectors/src/connectors/github/lib/hierarchy.ts b/connectors/src/connectors/github/lib/hierarchy.ts index 1dcc4440c70a..893dfc179cec 100644 --- a/connectors/src/connectors/github/lib/hierarchy.ts +++ b/connectors/src/connectors/github/lib/hierarchy.ts @@ -19,7 +19,7 @@ export async function getGithubCodeOrDirectoryParentIds( return []; } -async function getGithubCodeDirectoryParentIds( +export async function getGithubCodeDirectoryParentIds( connectorId: ModelId,
internalId: string, repoId: number diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index 1de0d13aa8cc..2391c0b6f722 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -27,9 +27,11 @@ import { newWebhookSignal } from "@connectors/connectors/github/temporal/signals import { getCodeSyncWorkflowId } from "@connectors/connectors/github/temporal/utils"; import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config"; import { + deleteFolderNode, deleteFromDataSource, renderDocumentTitleAndContent, renderMarkdownSection, + upsertFolderNode, upsertToDatasource, } from "@connectors/lib/data_sources"; import { ExternalOAuthTokenError } from "@connectors/lib/error"; @@ -307,6 +309,14 @@ export async function githubUpsertIssueActivity( issueNumber, connectorId: connector.id, }); + + // Also upsert the Issue folder node + await upsertFolderNode({ + dataSourceConfig, + folderId: `${repoId}-issues`, + title: "Issues", + parents: [`${repoId}-issues`, repoId.toString()], + }); } async function renderDiscussion( @@ -493,6 +503,14 @@ export async function githubUpsertDiscussionActivity( discussionNumber: discussionNumber, connectorId: connector.id, }); + + // Also upsert the Discussion folder node + await upsertFolderNode({ + dataSourceConfig, + folderId: `${repoId}-discussions`, + title: "Discussions", + parents: [`${repoId}-discussions`, repoId.toString()], + }); } export async function githubGetRepoDiscussionsResultPageActivity( @@ -630,6 +648,12 @@ export async function githubRepoGarbageCollectActivity( ); } + // Delete the Issues folder node + await deleteFolderNode({ + dataSourceConfig, + folderId: `${repoId}-issues`, + }); + const discussionsInRepo = await GithubDiscussion.findAll({ where: { repoId, @@ -651,6 +675,12 @@ export async function githubRepoGarbageCollectActivity( ); } + // Delete the 
Discussion folder node + await deleteFolderNode({ + dataSourceConfig, + folderId: `${repoId}-discussions`, + }); + await Promise.all(promises); await garbageCollectCodeSync( @@ -668,6 +698,12 @@ export async function githubRepoGarbageCollectActivity( repoId: repoId.toString(), }, }); + + // Delete the Repository folder node + await deleteFolderNode({ + dataSourceConfig, + folderId: repoId, + }); } async function deleteIssue( @@ -846,7 +882,24 @@ async function garbageCollectCodeSync( }, }, }); + // Also delete folder nodes + const fq = new PQueue({ concurrency: 8 }); + directoriesToDelete.forEach((d) => + fq.add(async () => { + Context.current().heartbeat(); + await deleteFolderNode({ + dataSourceConfig, + folderId: d.internalId, + }); + }) + ); } + + // Delete the Code folder + await deleteFolderNode({ + dataSourceConfig, + folderId: `github-code-${repoId}`, + }); } export async function githubCodeSyncActivity({ @@ -910,6 +963,12 @@ export async function githubCodeSyncActivity({ }, }); + // Delete the Repository folder too + await deleteFolderNode({ + dataSourceConfig, + folderId: repoId.toString(), + }); + return; } @@ -941,6 +1000,14 @@ export async function githubCodeSyncActivity({ githubCodeRepository.lastSeenAt = codeSyncStartedAt; await githubCodeRepository.save(); + // Create the Repository folder node + await upsertFolderNode({ + dataSourceConfig, + folderId: githubCodeRepository.repoId, + title: githubCodeRepository.repoName, + parents: [githubCodeRepository.repoId], + }); + logger.info( { repoId, @@ -991,6 +1058,12 @@ export async function githubCodeSyncActivity({ }, }); + // Delete the Repository folder too + await deleteFolderNode({ + dataSourceConfig, + folderId: repoId.toString(), + }); + return; } @@ -1167,6 +1240,19 @@ export async function githubCodeSyncActivity({ }); } + // Create the folder node + await upsertFolderNode({ + dataSourceConfig, + folderId: d.internalId, + parents: [ + d.internalId, + ...d.parents, + 
`github-code-${repoId.toString()}`, + repoId.toString(), + ], + title: d.dirName, + }); + // If the parents have updated then the internalId gets updated as well so we should never // have an udpate to parentInternalId. We check that this is always the case. If the // directory is moved (the parents change) then it will trigger the creation of a new @@ -1206,6 +1292,16 @@ export async function githubCodeSyncActivity({ logger.child({ task: "garbageCollectCodeSync" }) ); + // Create the Code folder node. + if (files.length > 0 || directories.length > 0) { + await upsertFolderNode({ + dataSourceConfig, + folderId: `github-code-${repoId}`, + title: "Code", + parents: [`github-code-${repoId}`, repoId.toString()], + }); + } + // Finally we update the repository updatedAt value. if (repoUpdatedAt) { githubCodeRepository.codeUpdatedAt = repoUpdatedAt; diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index cd9dfda01b74..e9dd20e2099d 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -1162,7 +1162,6 @@ export async function deleteFolderNode({ }: { dataSourceConfig: DataSourceConfig; folderId: string; - loggerArgs?: Record; }) { const r = await getDustAPI(dataSourceConfig).deleteFolder( dataSourceConfig.dataSourceId,