-
Notifications
You must be signed in to change notification settings - Fork 112
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Github Sync updates github docs parents Field #1296
Changes from 33 commits
0a42e8d
605fbe3
8004a79
876225a
ca0e3f3
406ca6f
7303190
cf71c42
f2eff93
9a30d02
2c7efc6
5628ee9
1bdc7c8
120e2de
b8a3091
31a853d
2462e98
a350434
458bc7c
aaec779
0acb000
ed66a6a
51ffbb1
7c000fe
83bdaa4
16f74e1
853d7ea
af02ecb
9ae414a
4b73bb8
319f946
de327b6
74a96b9
8e8c090
572bf8d
140175a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import { | ||
getDiscussionDocumentId, | ||
getIssueDocumentId, | ||
} from "@connectors/connectors/github/temporal/activities"; | ||
import { updateDocumentParentsField } from "@connectors/lib/data_sources"; | ||
import { | ||
Connector, | ||
GithubDiscussion, | ||
GithubIssue, | ||
} from "@connectors/lib/models"; | ||
|
||
async function main() { | ||
if (!process.argv[2]) { | ||
console.error("Missing workspace id or 'all' as first argument"); | ||
process.exit(1); | ||
} | ||
// if first arg is "all", update all connectors, else update only the | ||
// connector for the corresponding workspace id | ||
const connectors = | ||
process.argv[2] === "all" | ||
? await Connector.findAll({ | ||
where: { | ||
type: "github", | ||
}, | ||
}) | ||
: await Connector.findAll({ | ||
where: { | ||
type: "github", | ||
workspaceId: process.argv[2], | ||
}, | ||
}); | ||
|
||
for (const connector of connectors) { | ||
console.log(`Updating parents field for connector ${connector.id}`); | ||
await updateDiscussionsParentsFieldForConnector(connector); | ||
await updateIssuesParentsFieldForConnector(connector); | ||
} | ||
} | ||
|
||
async function updateDiscussionsParentsFieldForConnector(connector: Connector) { | ||
// get all distinct documentIds and their channel ids from slack messages in | ||
// this connector | ||
const documentData = await GithubDiscussion.findAll({ | ||
where: { | ||
connectorId: connector.id, | ||
}, | ||
attributes: ["repoId", "discussionNumber"], | ||
}); | ||
// update all parents fields for discussions by chunks of 128 | ||
const chunkSize = 32; | ||
for (let i = 0; i < documentData.length; i += chunkSize) { | ||
const chunk = documentData.slice(i, i + chunkSize); | ||
console.log(`Updating ${chunk.length} documents`); | ||
// update parents field for each document of the chunk, in parallel | ||
await Promise.all( | ||
chunk.map(async (document) => { | ||
const docId = getDiscussionDocumentId( | ||
document.repoId, | ||
document.discussionNumber | ||
); | ||
await updateDocumentParentsField(connector, docId, [ | ||
getDiscussionDocumentId(document.repoId, document.discussionNumber), | ||
document.repoId, | ||
]); | ||
}) | ||
); | ||
} | ||
} | ||
|
||
async function updateIssuesParentsFieldForConnector(connector: Connector) { | ||
// get all distinct issues and their repo ids fro | ||
const documentData = await GithubIssue.findAll({ | ||
where: { | ||
connectorId: connector.id, | ||
}, | ||
attributes: ["repoId", "issueNumber"], | ||
}); | ||
// update all parents fields for all issues by chunks of 128 | ||
const chunkSize = 32; | ||
for (let i = 0; i < documentData.length; i += chunkSize) { | ||
const chunk = documentData.slice(i, i + chunkSize); | ||
console.log(`Updating ${chunk.length} documents`); | ||
// update parents field for each document of the chunk, in parallel | ||
await Promise.all( | ||
chunk.map(async (document) => { | ||
const docId = getIssueDocumentId(document.repoId, document.issueNumber); | ||
await updateDocumentParentsField(connector, docId, [ | ||
getIssueDocumentId(document.repoId, document.issueNumber), | ||
document.repoId, | ||
]); | ||
}) | ||
); | ||
} | ||
} | ||
|
||
main() | ||
.then(() => { | ||
console.log("Done"); | ||
process.exit(0); | ||
}) | ||
.catch((err) => { | ||
console.error(err); | ||
process.exit(1); | ||
}); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import { updateAllParentsFields } from "@connectors/connectors/notion/lib/parents"; | ||
import { Connector, NotionDatabase, NotionPage } from "@connectors/lib/models"; | ||
|
||
async function main() { | ||
// if first arg is "all", update all connectors, else update only the | ||
// connector for the corresponding workspace id | ||
const connectors = | ||
process.argv[2] === "all" | ||
? await Connector.findAll({ | ||
where: { | ||
type: "notion", | ||
}, | ||
}) | ||
: await Connector.findAll({ | ||
where: { | ||
type: "notion", | ||
workspaceId: process.argv[2], | ||
}, | ||
}); | ||
|
||
for (const connector of connectors) { | ||
console.log(`Updating parents field for connector ${connector.id}`); | ||
await updateParentsFieldForConnector(connector); | ||
} | ||
} | ||
|
||
async function updateParentsFieldForConnector(connector: Connector) { | ||
// get all pages and databases for this connector | ||
const pages = await NotionPage.findAll({ | ||
where: { | ||
connectorId: connector.id, | ||
}, | ||
}); | ||
const databases = await NotionDatabase.findAll({ | ||
where: { | ||
connectorId: connector.id, | ||
}, | ||
}); | ||
|
||
// update all parents fields for all pages and databases | ||
await updateAllParentsFields(connector, [...pages, ...databases]); | ||
} | ||
|
||
main() | ||
.then(() => { | ||
console.log("Done"); | ||
process.exit(0); | ||
}) | ||
.catch((err) => { | ||
console.error(err); | ||
process.exit(1); | ||
}); |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -154,7 +154,16 @@ export async function githubUpsertIssueActivity( | |
documentUrl: issue.url, | ||
timestampMs: lastUpdateTimestamp, | ||
tags: tags, | ||
parents: [], | ||
// The convention for parents is to use the external id string; it is ok for | ||
// repos, but not practical for issues since the external id is the | ||
// issue number, which is not guaranteed unique in the workspace. | ||
// Therefore as a special case we use getIssueDocumentId() to get a parent string | ||
// The repo id from github is globally unique so used as-is, as per | ||
// convention to use the external id string. | ||
parents: [ | ||
getIssueDocumentId(repoId.toString(), issue.number), | ||
repoId.toString(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just out of curiosity what does this Id look like? Is that a number? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we have 2 github managed data sources I worry we'll be clashing, which means we should probably update the connectors resource API to use something more specific? We want to make sure it's globally unique There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No clear doc about this, but I have created two repos at ~ 30mn interval and their repo ids are 688834560 and 688870568 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These look long enough to me 👍 :) |
||
], | ||
retries: 3, | ||
delayBetweenRetriesMs: 500, | ||
loggerArgs: { ...loggerArgs, provider: "github" }, | ||
|
@@ -281,7 +290,16 @@ export async function githubUpsertDiscussionActivity( | |
documentUrl: discussion.url, | ||
timestampMs: new Date(discussion.createdAt).getTime(), | ||
tags, | ||
parents: [], | ||
// The convention for parents is to use the external id string; it is ok for | ||
// repos, but not practical for discussions since the external id is the | ||
// issue number, which is not guaranteed unique in the workspace. Therefore | ||
// as a special case we use getDiscussionDocumentId() to get a parent string | ||
// The repo id from github is globally unique so used as-is, as per | ||
// convention to use the external id string. | ||
parents: [ | ||
getDiscussionDocumentId(repoId.toString(), discussionNumber), | ||
repoId.toString(), | ||
], | ||
retries: 3, | ||
delayBetweenRetriesMs: 500, | ||
loggerArgs: { ...loggerArgs, provider: "github" }, | ||
|
@@ -585,11 +603,14 @@ function renderGithubUser(user: GithubUser | null): string { | |
return `@${user.id}`; | ||
} | ||
|
||
function getIssueDocumentId(repoId: string, issueNumber: number): string { | ||
export function getIssueDocumentId( | ||
repoId: string, | ||
issueNumber: number | ||
): string { | ||
return `github-issue-${repoId}-${issueNumber}`; | ||
} | ||
|
||
function getDiscussionDocumentId( | ||
export function getDiscussionDocumentId( | ||
repoId: string, | ||
discussionNumber: number | ||
): string { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess it doesn't matter much since it a one off, but curious what happens if I forget argv[2] ? Does it basically does the same as
"all"
becauseworkspaceId: undefined
doesn't filter ?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice catch. Let's validate that