diff --git a/connectors/migrations/20230906_3_github_fill_parents_field.ts b/connectors/migrations/20230906_3_github_fill_parents_field.ts new file mode 100644 index 000000000000..df42b9f210de --- /dev/null +++ b/connectors/migrations/20230906_3_github_fill_parents_field.ts @@ -0,0 +1,104 @@ +import { + getDiscussionDocumentId, + getIssueDocumentId, +} from "@connectors/connectors/github/temporal/activities"; +import { updateDocumentParentsField } from "@connectors/lib/data_sources"; +import { + Connector, + GithubDiscussion, + GithubIssue, +} from "@connectors/lib/models"; + +async function main() { + if (!process.argv[2]) { + console.error("Missing workspace id or 'all' as first argument"); + process.exit(1); + } + // if first arg is "all", update all connectors, else update only the + // connector for the corresponding workspace id + const connectors = + process.argv[2] === "all" + ? await Connector.findAll({ + where: { + type: "github", + }, + }) + : await Connector.findAll({ + where: { + type: "github", + workspaceId: process.argv[2], + }, + }); + + for (const connector of connectors) { + console.log(`Updating parents field for connector ${connector.id}`); + await updateDiscussionsParentsFieldForConnector(connector); + await updateIssuesParentsFieldForConnector(connector); + } +} + +async function updateDiscussionsParentsFieldForConnector(connector: Connector) { + // get all distinct documentIds and their channel ids from slack messages in + // this connector + const documentData = await GithubDiscussion.findAll({ + where: { + connectorId: connector.id, + }, + attributes: ["repoId", "discussionNumber"], + }); + // update all parents fields for discussions by chunks of 128 + const chunkSize = 32; + for (let i = 0; i < documentData.length; i += chunkSize) { + const chunk = documentData.slice(i, i + chunkSize); + console.log(`Updating ${chunk.length} documents`); + // update parents field for each document of the chunk, in parallel + await Promise.all( + chunk.map(async (document) => { + const docId = getDiscussionDocumentId( + document.repoId, + document.discussionNumber + ); + await updateDocumentParentsField(connector, docId, [ + getDiscussionDocumentId(document.repoId, document.discussionNumber), + document.repoId, + ]); + }) + ); + } +} + +async function updateIssuesParentsFieldForConnector(connector: Connector) { + // get all distinct issues and their repo ids fro + const documentData = await GithubIssue.findAll({ + where: { + connectorId: connector.id, + }, + attributes: ["repoId", "issueNumber"], + }); + // update all parents fields for all issues by chunks of 128 + const chunkSize = 32; + for (let i = 0; i < documentData.length; i += chunkSize) { + const chunk = documentData.slice(i, i + chunkSize); + console.log(`Updating ${chunk.length} documents`); + // update parents field for each document of the chunk, in parallel + await Promise.all( + chunk.map(async (document) => { + const docId = getIssueDocumentId(document.repoId, document.issueNumber); + await updateDocumentParentsField(connector, docId, [ + getIssueDocumentId(document.repoId, document.issueNumber), + document.repoId, + ]); + }) + ); + } +} + +main() + .then(() => { + console.log("Done"); + process.exit(0); + }) + .catch((err) => { + console.error(err); + process.exit(1); + }); diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index 6fc6295f3c2c..c97c7723eb24 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -154,7 +154,16 @@ export async function githubUpsertIssueActivity( documentUrl: issue.url, timestampMs: lastUpdateTimestamp, tags: tags, - parents: [], + // The convention for parents is to use the external id string; it is ok for + // repos, but not practical for issues since the external id is the + // issue number, which is not guaranteed unique in the workspace. + // Therefore as a special case we use getIssueDocumentId() to get a parent string + // The repo id from github is globally unique so used as-is, as per + // convention to use the external id string. + parents: [ + getIssueDocumentId(repoId.toString(), issue.number), + repoId.toString(), + ], retries: 3, delayBetweenRetriesMs: 500, loggerArgs: { ...loggerArgs, provider: "github" }, @@ -281,7 +290,16 @@ export async function githubUpsertDiscussionActivity( documentUrl: discussion.url, timestampMs: new Date(discussion.createdAt).getTime(), tags, - parents: [], + // The convention for parents is to use the external id string; it is ok for + // repos, but not practical for discussions since the external id is the + // issue number, which is not guaranteed unique in the workspace. Therefore + // as a special case we use getDiscussionDocumentId() to get a parent string + // The repo id from github is globally unique so used as-is, as per + // convention to use the external id string. + parents: [ + getDiscussionDocumentId(repoId.toString(), discussionNumber), + repoId.toString(), + ], retries: 3, delayBetweenRetriesMs: 500, loggerArgs: { ...loggerArgs, provider: "github" }, @@ -585,11 +603,14 @@ function renderGithubUser(user: GithubUser | null): string { return `@${user.id}`; } -function getIssueDocumentId(repoId: string, issueNumber: number): string { +export function getIssueDocumentId( + repoId: string, + issueNumber: number +): string { return `github-issue-${repoId}-${issueNumber}`; } -function getDiscussionDocumentId( +export function getDiscussionDocumentId( repoId: string, discussionNumber: number ): string { diff --git a/connectors/src/types/resources.ts b/connectors/src/types/resources.ts index 47f7b7e47d9d..f4d8771f335d 100644 --- a/connectors/src/types/resources.ts +++ b/connectors/src/types/resources.ts @@ -15,8 +15,19 @@ export type ConnectorResourceType = "file" | "folder" | "database" | "channel"; * - Slack: channels * - GoogleDrive: shared drive or sub-folders of shared drives. * - * `internalId` and `parentInternalId` are internal opaque identifiers that should enable - * reconstructing the tree structure of the resources. + * `internalId` and `parentInternalId` are internal opaque identifiers that + * should enable reconstructing the tree structure of the resources. + * + * Those ids must be aligned with those used in the "parents" field of data + * sources documents, to enable search filter on documents based on their + * parents, see the + * + * The convention to use for internal ids are to always use the externally + * provided id when possible (e.g. Notion page id, Github repository id, + * etc...). When not possible, such as for Github issues whose id is not + * workspace-unique, a custom function to create a unique id is created, and + * used both in the parents field management code and the connectors resource + * code. */ export type ConnectorResource = { provider: ConnectorProvider; diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs index 3f0ad1338d48..388c1230f050 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -95,19 +95,25 @@ pub struct Chunk { /// The "parents" field is an array of ids of parents to the document, /// corresponding to its hierarchy, ordered by closest parent first. /// -/// At index 0 is the document id itself, then at index 1 its direct parent, -/// then at index 2 is the direct parent of the element represented at index 1, -/// etc. It is assumed that a document (or folder, or hierarchical level) only -/// has at most one direct parent. Therefore, there is an unambiguous mapping -/// between the parents array and the document's hierarchical position. For -/// example, for a regular file system (or filesystem-like such as Google -/// Drive), each parent would correspond to a subfolder in the path to the -/// document. +/// Parents are at the time of writing only relevant for managed datasources +/// since standard datasources do not allow specifying a hierarchy. A parent is +/// represented by a string of characters which correspond to the parent's +/// external id, provided by the managed datasource’s API (e.g. the Notion id +/// for Notion pages and databases). +/// +/// At index 0 is the string id of the document itself, then at index 1 its +/// direct parent, then at index 2 is the direct parent of the element +/// represented at index 1, etc. It is assumed that a document (or folder, or +/// hierarchical level) only has at most one direct parent. Therefore, there is +/// an unambiguous mapping between the parents array and the document's +/// hierarchical position. For example, for a regular file system (or +/// filesystem-like such as Google Drive), each parent would correspond to a +/// subfolder in the path to the document. /// -/// The document’s id is stored in the field, since the field is used in -/// filtering search to search only parts of the hierarchy: it is natural that -/// if the document’s id is selected as a parent filter, the document itself -/// shows up in the search. +/// The id of the document itself is stored at index 0 because the field is used +/// in filtering search to search only parts of the hierarchy: it is natural +/// that if the document’s id is selected as a parent filter, the document +/// itself shows up in the search. /// /// /// Note however that the hierarchical system depends on the managed datasource. @@ -116,11 +122,9 @@ pub struct Chunk { /// no slack ids for our slack "documents" so the only value in the parents /// array is the slack channel id /// -/// Parents are at the time of writing only relevant for managed datasources -/// since standard datasources do not allow specifying a hierarchy. A parent is -/// represented by a string of characters which correspond to the parent's -/// internal id (specific to the managed datasource)--not its name--provided by -/// the managed datasource. +/// For github, github issues / discussions do not have a proper external id, so +/// we use our computed document id. The repo is considered a parent, and has a +/// proper external “repo id”, which is stored at 2nd place in the array #[derive(Debug, Serialize, Clone)] pub struct Document {