Skip to content

Commit

Permalink
comments from pr + adaptations from discussions
Browse files Browse the repository at this point in the history
  • Loading branch information
philipperolet committed Sep 7, 2023
1 parent 319f946 commit de327b6
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 31 deletions.
15 changes: 9 additions & 6 deletions connectors/migrations/20230906_3_github_fill_parents_field.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ import {
} from "@connectors/lib/models";

async function main() {
if (!process.argv[2]) {
console.error("Missing workspace id or 'all' as first argument");
process.exit(1);
}
// if first arg is "all", update all connectors, else update only the
// connector for the corresponding workspace id
const connectors =
Expand Down Expand Up @@ -42,7 +46,7 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
},
attributes: ["repoId", "discussionNumber"],
});
// update all parents fields for all pages and databases by chunks of 128
// update all parents fields for discussions by chunks of 128
const chunkSize = 128;
for (let i = 0; i < documentData.length; i += chunkSize) {
const chunk = documentData.slice(i, i + chunkSize);
Expand All @@ -55,7 +59,7 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
document.discussionNumber
);
await updateDocumentParentsField(connector, docId, [
document.discussionNumber.toString(),
getDiscussionDocumentId(document.repoId, document.discussionNumber),
document.repoId,
]);
})
Expand All @@ -64,15 +68,14 @@ async function updateDiscussionsParentsFieldForConnector(connector: Connector) {
}

async function updateIssuesParentsFieldForConnector(connector: Connector) {
// get all distinct documentIds and their channel ids from slack messages in
// this connector
// get all distinct issues and their repo ids for this connector
const documentData = await GithubIssue.findAll({
where: {
connectorId: connector.id,
},
attributes: ["repoId", "issueNumber"],
});
// update all parents fields for all pages and databases by chunks of 128
// update all parents fields for all issues by chunks of 128
const chunkSize = 128;
for (let i = 0; i < documentData.length; i += chunkSize) {
const chunk = documentData.slice(i, i + chunkSize);
Expand All @@ -82,7 +85,7 @@ async function updateIssuesParentsFieldForConnector(connector: Connector) {
chunk.map(async (document) => {
const docId = getIssueDocumentId(document.repoId, document.issueNumber);
await updateDocumentParentsField(connector, docId, [
document.issueNumber.toString(),
getIssueDocumentId(document.repoId, document.issueNumber),
document.repoId,
]);
})
Expand Down
18 changes: 16 additions & 2 deletions connectors/src/connectors/github/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,14 @@ export async function githubUpsertIssueActivity(
documentUrl: issue.url,
timestampMs: lastUpdateTimestamp,
tags: tags,
parents: [issueNumber.toString(), repoId.toString()],
// The convention for parents is to use the external id string; it is ok for
// repos, but not practical for issues since the external id is the
// issue number, which is not guaranteed unique in the workspace.
// Therefore as a special case we use getIssueDocumentId() to get a parent string
parents: [
getIssueDocumentId(repoId.toString(), issue.number),
repoId.toString(),
],
retries: 3,
delayBetweenRetriesMs: 500,
loggerArgs: { ...loggerArgs, provider: "github" },
Expand Down Expand Up @@ -281,7 +288,14 @@ export async function githubUpsertDiscussionActivity(
documentUrl: discussion.url,
timestampMs: new Date(discussion.createdAt).getTime(),
tags,
parents: [discussionNumber.toString(), repoId.toString()],
// The convention for parents is to use the external id string; it is ok for
// repos, but not practical for discussions since the external id is the
// discussion number, which is not guaranteed unique in the workspace.
// Therefore as a special case we use getDiscussionDocumentId() to get a parent string
parents: [
getDiscussionDocumentId(repoId.toString(), discussionNumber),
repoId.toString(),
],
retries: 3,
delayBetweenRetriesMs: 500,
loggerArgs: { ...loggerArgs, provider: "github" },
Expand Down
48 changes: 25 additions & 23 deletions core/src/data_sources/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,33 +95,35 @@ pub struct Chunk {
/// The "parents" field is an array of ids of parents to the document,
/// corresponding to its hierarchy, ordered by closest parent first.
///
/// At index 0 is the document id itself, then at index 1 its direct parent,
/// then at index 2 is the direct parent of the element represented at index 1,
/// etc. It is assumed that a document (or folder, or hierarchical level) only
/// has at most one direct parent. Therefore, there is an unambiguous mapping
/// between the parents array and the document's hierarchical position. For
/// example, for a regular file system (or filesystem-like such as Google
/// Drive), each parent would correspond to a subfolder in the path to the
/// document.
/// Parents are at the time of writing only relevant for managed datasources
/// since standard datasources do not allow specifying a hierarchy. A parent is
/// represented by a string of characters which corresponds to the parent's
/// external id, provided by the managed datasource’s API (e.g. the Notion id
/// for Notion pages and databases).
///
/// The document’s id is stored in the field, since the field is used in
/// filtering search to search only parts of the hierarchy: it is natural that
/// if the document’s id is selected as a parent filter, the document itself
/// shows up in the search.
/// At index 0 is the string id of the document itself, then at index 1 its
/// direct parent, then at index 2 is the direct parent of the element
/// represented at index 1, etc. It is assumed that a document (or folder, or
/// hierarchical level) only has at most one direct parent. Therefore, there is
/// an unambiguous mapping between the parents array and the document's
/// hierarchical position. For example, for a regular file system (or
/// filesystem-like such as Google Drive), each parent would correspond to a
/// subfolder in the path to the document.
///
/// Note however that the hierarchical system depends on the managed datasource.
/// For example, in the Slack managed datasource, documents only have a single
/// parent in the array, their channel (since a channel does not have any
/// parent).
/// The id of the document itself is stored at index 0 because the field is used
/// in filtering search to search only parts of the hierarchy: it is natural
/// that if the document’s id is selected as a parent filter, the document
/// itself shows up in the search.
///
/// For github, we store the github issue / discussion's number (as string), and
/// the repo name (its "parent" in the hierarchy).
/// Note however that the hierarchical system depends on the managed datasource.
/// For example, in the Slack managed datasource, documents are aggregated
/// messages from a channel. A channel does not have any parent, and there are
/// no Slack ids for our Slack "documents", so the only value in the parents
/// array is the Slack channel id.
///
/// Parents are at the time of writing only relevant for managed datasources
/// since standard datasources do not allow specifying a hierarchy. A parent is
/// represented by a string of characters which correspond to the parent's
/// internal id (specific to the managed datasource)--not its name--provided by
/// the managed datasource.
/// For GitHub, issues / discussions do not have a proper external id, so
/// we use our computed document id. The repo is considered a parent, and has a
/// proper external “repo id”, which is stored at index 1 in the array.
#[derive(Debug, Serialize, Clone)]
pub struct Document {
Expand Down

0 comments on commit de327b6

Please sign in to comment.