Skip to content

Commit

Permalink
Github Sync updates github docs parents Field (#1296)
Browse files Browse the repository at this point in the history
* doc update of parents field

* core api to update parents (+ refactor of update tags)

* spolu review

* logging

* review 2

* fixed qdrant payload update via document hash

* WIP

* wip2

* wip 3

* wip 4

* add parents endpoint to front

* cleaning

* removed js Set in parents (semantics don't work)

* cleaning

* remove SyncWorkflowResult

* controlled memoization of getParents

* cleaning

* handle void returns from pqueue execution in notion workflow

* fix: document is of class object at runtime, not notionpage or notiondb

* cleaning

* bump notion workflow version

* pass timestamp to activity execution for better memoization

* henry's suggestions

* migration script

* cleaning

* leaner getParents + fixes on potential inconsistencies

* renaming

* Update parents during sync

* migration script

* comments from pr + adaptations from discussions

* last PR comments

* doc on convention on connector resource
  • Loading branch information
philipperolet authored Sep 8, 2023
1 parent ae30922 commit 7a20e7e
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 23 deletions.
104 changes: 104 additions & 0 deletions connectors/migrations/20230906_3_github_fill_parents_field.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import {
getDiscussionDocumentId,
getIssueDocumentId,
} from "@connectors/connectors/github/temporal/activities";
import { updateDocumentParentsField } from "@connectors/lib/data_sources";
import {
Connector,
GithubDiscussion,
GithubIssue,
} from "@connectors/lib/models";

async function main() {
if (!process.argv[2]) {
console.error("Missing workspace id or 'all' as first argument");
process.exit(1);
}
// if first arg is "all", update all connectors, else update only the
// connector for the corresponding workspace id
const connectors =
process.argv[2] === "all"
? await Connector.findAll({
where: {
type: "github",
},
})
: await Connector.findAll({
where: {
type: "github",
workspaceId: process.argv[2],
},
});

for (const connector of connectors) {
console.log(`Updating parents field for connector ${connector.id}`);
await updateDiscussionsParentsFieldForConnector(connector);
await updateIssuesParentsFieldForConnector(connector);
}
}

async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
// get all distinct documentIds and their channel ids from slack messages in
// this connector
const documentData = await GithubDiscussion.findAll({
where: {
connectorId: connector.id,
},
attributes: ["repoId", "discussionNumber"],
});
// update all parents fields for discussions by chunks of 128
const chunkSize = 32;
for (let i = 0; i < documentData.length; i += chunkSize) {
const chunk = documentData.slice(i, i + chunkSize);
console.log(`Updating ${chunk.length} documents`);
// update parents field for each document of the chunk, in parallel
await Promise.all(
chunk.map(async (document) => {
const docId = getDiscussionDocumentId(
document.repoId,
document.discussionNumber
);
await updateDocumentParentsField(connector, docId, [
getDiscussionDocumentId(document.repoId, document.discussionNumber),
document.repoId,
]);
})
);
}
}

async function updateIssuesParentsFieldForConnector(connector: Connector) {
// get all distinct issues and their repo ids fro
const documentData = await GithubIssue.findAll({
where: {
connectorId: connector.id,
},
attributes: ["repoId", "issueNumber"],
});
// update all parents fields for all issues by chunks of 128
const chunkSize = 32;
for (let i = 0; i < documentData.length; i += chunkSize) {
const chunk = documentData.slice(i, i + chunkSize);
console.log(`Updating ${chunk.length} documents`);
// update parents field for each document of the chunk, in parallel
await Promise.all(
chunk.map(async (document) => {
const docId = getIssueDocumentId(document.repoId, document.issueNumber);
await updateDocumentParentsField(connector, docId, [
getIssueDocumentId(document.repoId, document.issueNumber),
document.repoId,
]);
})
);
}
}

main()
.then(() => {
console.log("Done");
process.exit(0);
})
.catch((err) => {
console.error(err);
process.exit(1);
});
29 changes: 25 additions & 4 deletions connectors/src/connectors/github/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,16 @@ export async function githubUpsertIssueActivity(
documentUrl: issue.url,
timestampMs: lastUpdateTimestamp,
tags: tags,
parents: [],
// The convention for parents is to use the external id string; it is ok for
// repos, but not practical for issues since the external id is the
// issue number, which is not guaranteed unique in the workspace.
// Therefore as a special case we use getIssueDocumentId() to get a parent string
// The repo id from github is globally unique so used as-is, as per
// convention to use the external id string.
parents: [
getIssueDocumentId(repoId.toString(), issue.number),
repoId.toString(),
],
retries: 3,
delayBetweenRetriesMs: 500,
loggerArgs: { ...loggerArgs, provider: "github" },
Expand Down Expand Up @@ -281,7 +290,16 @@ export async function githubUpsertDiscussionActivity(
documentUrl: discussion.url,
timestampMs: new Date(discussion.createdAt).getTime(),
tags,
parents: [],
// The convention for parents is to use the external id string; it is ok for
// repos, but not practical for discussions since the external id is the
// issue number, which is not guaranteed unique in the workspace. Therefore
// as a special case we use getDiscussionDocumentId() to get a parent string
// The repo id from github is globally unique so used as-is, as per
// convention to use the external id string.
parents: [
getDiscussionDocumentId(repoId.toString(), discussionNumber),
repoId.toString(),
],
retries: 3,
delayBetweenRetriesMs: 500,
loggerArgs: { ...loggerArgs, provider: "github" },
Expand Down Expand Up @@ -585,11 +603,14 @@ function renderGithubUser(user: GithubUser | null): string {
return `@${user.id}`;
}

function getIssueDocumentId(repoId: string, issueNumber: number): string {
export function getIssueDocumentId(
repoId: string,
issueNumber: number
): string {
return `github-issue-${repoId}-${issueNumber}`;
}

function getDiscussionDocumentId(
export function getDiscussionDocumentId(
repoId: string,
discussionNumber: number
): string {
Expand Down
15 changes: 13 additions & 2 deletions connectors/src/types/resources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,19 @@ export type ConnectorResourceType = "file" | "folder" | "database" | "channel";
* - Slack: channels
* - GoogleDrive: shared drive or sub-folders of shared drives.
*
* `internalId` and `parentInternalId` are internal opaque identifiers that should enable
* reconstructing the tree structure of the resources.
* `internalId` and `parentInternalId` are internal opaque identifiers that
* should enable reconstructing the tree structure of the resources.
*
* Those ids must be aligned with those used in the "parents" field of data
* sources documents, to enable search filter on documents based on their
* parents, see the
*
* The convention to use for internal ids are to always use the externally
* provided id when possible (e.g. Notion page id, Github repository id,
* etc...). When not possible, such as for Github issues whose id is not
* workspace-unique, a custom function to create a unique id is created, and
* used both in the parents field management code and the connectors resource
* code.
*/
export type ConnectorResource = {
provider: ConnectorProvider;
Expand Down
38 changes: 21 additions & 17 deletions core/src/data_sources/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,19 +95,25 @@ pub struct Chunk {
/// The "parents" field is an array of ids of parents to the document,
/// corresponding to its hierarchy, ordered by closest parent first.
///
/// At index 0 is the document id itself, then at index 1 its direct parent,
/// then at index 2 is the direct parent of the element represented at index 1,
/// etc. It is assumed that a document (or folder, or hierarchical level) only
/// has at most one direct parent. Therefore, there is an unambiguous mapping
/// between the parents array and the document's hierarchical position. For
/// example, for a regular file system (or filesystem-like such as Google
/// Drive), each parent would correspond to a subfolder in the path to the
/// document.
/// Parents are at the time of writing only relevant for managed datasources
/// since standard datasources do not allow specifying a hierarchy. A parent is
/// represented by a string of characters which correspond to the parent's
/// external id, provided by the managed datasource’s API (e.g. the Notion id
/// for Notion pages and databases).
///
/// At index 0 is the string id of the document itself, then at index 1 its
/// direct parent, then at index 2 is the direct parent of the element
/// represented at index 1, etc. It is assumed that a document (or folder, or
/// hierarchical level) only has at most one direct parent. Therefore, there is
/// an unambiguous mapping between the parents array and the document's
/// hierarchical position. For example, for a regular file system (or
/// filesystem-like such as Google Drive), each parent would correspond to a
/// subfolder in the path to the document.
///
/// The document’s id is stored in the field, since the field is used in
/// filtering search to search only parts of the hierarchy: it is natural that
/// if the document’s id is selected as a parent filter, the document itself
/// shows up in the search.
/// The id of the document itself is stored at index 0 because the field is used
/// in filtering search to search only parts of the hierarchy: it is natural
/// that if the document’s id is selected as a parent filter, the document
/// itself shows up in the search.
///
///
/// Note however that the hierarchical system depends on the managed datasource.
Expand All @@ -116,11 +122,9 @@ pub struct Chunk {
/// no slack ids for our slack "documents" so the only value in the parents
/// array is the slack channel id
///
/// Parents are at the time of writing only relevant for managed datasources
/// since standard datasources do not allow specifying a hierarchy. A parent is
/// represented by a string of characters which correspond to the parent's
/// internal id (specific to the managed datasource)--not its name--provided by
/// the managed datasource.
/// For github, github issues / discussions do not have a proper external id, so
/// we use our computed document id. The repo is considered a parent, and has a
/// proper external “repo id”, which is stored at 2nd place in the array
#[derive(Debug, Serialize, Clone)]
pub struct Document {
Expand Down

0 comments on commit 7a20e7e

Please sign in to comment.