Skip to content

Commit

Permalink
Integrate Github Discussion in scraper and update scraper-dry-run wor…
Browse files Browse the repository at this point in the history
…kflow
  • Loading branch information
dgparmar14 committed Jun 18, 2024
1 parent 6cc5dac commit a1491b9
Show file tree
Hide file tree
Showing 8 changed files with 224 additions and 13 deletions.
25 changes: 20 additions & 5 deletions .github/workflows/scraper-dry-run.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,39 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: setup python
uses: actions/setup-python@v3
- name: Setup Node.js
uses: actions/setup-node@v3
with:
python-version: "3.10"
node-version: "20"

- name: Install pnpm
run: npm install -g pnpm

- name: Install dependencies
run: pip install -r scraper/requirements.txt
run: pnpm install --frozen-lockfile
working-directory: scraper

- name: Build the project
run: pnpm build
working-directory: scraper

- name: Scrape data from GitHub
run: python scraper/src/github.py ${{ github.repository_owner }} data/github -l DEBUG
run: pnpm start -- ${{ github.repository_owner }} data/github
working-directory: scraper
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Verify scraper output
run: ls -l data/github

- name: Generate markdown files for new contributors
run: node scripts/generateNewContributors.js
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Verify generated markdown files
run: ls -l contributors

- uses: actions/upload-artifact@v4
with:
name: output
Expand Down
132 changes: 132 additions & 0 deletions scraper/src/github-scraper/discussion.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import { octokit } from "./config.js";
import { Discussion, Edge } from "./types.js";

// GraphQL query: for every repository in the organization, fetch up to 100
// discussions, each with author, category, upvote/reaction counts, answered
// state, and the first 10 comments. Only the `repositories` connection is
// cursor-paginated ($cursor); discussions beyond the first 100 per repo and
// comments beyond the first 10 per discussion are NOT fetched — a known
// truncation limit of this query.
const query = `
query($org: String!, $cursor: String) {
  organization(login: $org) {
    repositories(first: 100, after: $cursor) {
      pageInfo {
        hasNextPage
        endCursor
      }
      edges {
        node {
          name
          discussions(first: 100) {
            edges {
              node {
                id
                title
                author {
                  login
                  avatarUrl
                }
                url
                category{
                  id
                  name
                  emoji
                }
                upvoteCount
                reactions {
                  totalCount
                }
                comments(first: 10) {
                  edges {
                    node {
                      author {
                        login
                        avatarUrl
                      }
                      upvoteCount
                      isAnswer
                    }
                  }
                }
                createdAt
                isAnswered
              }
            }
          }
        }
      }
    }
  }
}
`;
/**
 * Fetches discussion edges for every repository in an organization,
 * following the repository-level pagination cursor until exhausted.
 *
 * NOTE(review): only the `repositories` connection is paginated; each repo's
 * discussions (first: 100) and comments (first: 10) are capped by the query
 * itself, so very active repos are truncated — confirm this is acceptable.
 *
 * @param org    GitHub organization login.
 * @param cursor Repository pagination cursor; null requests the first page.
 * @returns Flat list of discussion edges across all repositories.
 */
async function fetchDiscussionsForOrg(
  org: string,
  cursor: string | null = null,
): Promise<Discussion[]> {
  const allDiscussions: Discussion[] = [];

  // Iterate pages instead of recursing so a large organization cannot
  // grow the call stack; also avoids re-concatenating arrays per page.
  let hasNextPage = true;
  while (hasNextPage) {
    const variables = { org, cursor };
    const response = await octokit.graphql(query, variables);
    const repositories = response.organization.repositories;

    for (const edge of repositories.edges as Edge[]) {
      allDiscussions.push(...edge.node.discussions.edges);
    }

    hasNextPage = repositories.pageInfo.hasNextPage;
    cursor = repositories.pageInfo.endCursor;
  }

  return allDiscussions;
}

/**
 * Groups discussion edges by participating user.
 *
 * A user "participates" in a discussion by authoring it or by commenting on
 * it. Every participant gets an entry listing all discussions they touched,
 * flattened into a plain record with a deduplicated `participants` list.
 *
 * @param allDiscussions Raw discussion edges from the GraphQL query.
 * @returns One `{ user, discussions }` record per unique login.
 */
async function parseDiscussionData(allDiscussions: Discussion[]) {
  // Every commenter login first, then every discussion author login —
  // the Set keeps the first occurrence's position.
  const commenterLogins = allDiscussions.flatMap((discussion) =>
    discussion.node.comments.edges.map((comment) => comment.node.author.login),
  );
  const authorLogins = allDiscussions.map(
    (discussion) => discussion.node.author.login,
  );
  const uniqueLogins = [...new Set([...commenterLogins, ...authorLogins])];

  return uniqueLogins.map((user) => {
    // Discussions this user either opened or commented on.
    const related = allDiscussions.filter(
      (discussion) =>
        discussion.node.author.login === user ||
        discussion.node.comments.edges.some(
          (comment) => comment.node.author.login === user,
        ),
    );

    const discussions = related.map(({ node }) => ({
      id: node.id,
      title: node.title,
      url: node.url,
      createdAt: node.createdAt,
      author: node.author,
      category: node.category,
      isAnswered: node.isAnswered,
      upvoteCount: node.upvoteCount,
      // One entry per commenter login: the Map keyed by login means a later
      // comment by the same user overwrites their earlier entry.
      participants: [
        ...new Map(
          node.comments.edges.map((comment) => [
            comment.node.author.login,
            {
              login: comment.node.author.login,
              avatarUrl: comment.node.author.avatarUrl,
              isAnswer: comment.node.isAnswer,
              upvoteCount: comment.node.upvoteCount,
            },
          ]),
        ).values(),
      ],
    }));

    return { user, discussions };
  });
}

export async function fetchAllDiscussionEventsByOrg(organizationName: string) {
try {
const allDiscussions = await fetchDiscussionsForOrg(organizationName);
const parseDiscussion = await parseDiscussionData(allDiscussions);
return parseDiscussion;
} catch (error: any) {
throw new Error(`Error fetching discussions: ${error.message}`);
}
}
20 changes: 18 additions & 2 deletions scraper/src/github-scraper/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ import { fetch_merge_events, fetchOpenPulls } from "./fetchUserData.js";
import { fetchEvents } from "./fetchEvents.js";
import { parseEvents } from "./parseEvents.js";
import { merged_data } from "./saveData.js";
import { fetchAllDiscussionEventsByOrg } from "./discussion.js";

let processedData: ProcessData = {};

const scrapeGitHub = async (
org: string,
date: string,
numDays: number = 1,
orgName: string,
): Promise<void> => {
const endDate: Date = startOfDay(parseISO(date));
const startDate: Date = startOfDay(subDays(endDate, numDays));
Expand All @@ -24,14 +26,14 @@ const scrapeGitHub = async (
endDate,
)) as IGitHubEvent[];
processedData = await parseEvents(events);

for (const user of Object.keys(processedData)) {
if (!processedData[user]) {
processedData[user] = {
authored_issue_and_pr: [],
last_updated: "",
activity: [],
open_prs: [],
discussions: [],
};
}
try {
Expand All @@ -51,6 +53,20 @@ const scrapeGitHub = async (
console.error(`Error fetching open pulls for ${user}: ${e}`);
}
}
const discussions = await fetchAllDiscussionEventsByOrg(orgName);
console.log("Scraping discussions");
discussions.forEach((d) => {
if (!processedData[d.user]) {
processedData[d.user] = {
authored_issue_and_pr: [],
last_updated: "",
activity: [],
open_prs: [],
discussions: [],
};
}
processedData[d.user].discussions = d.discussions;
});

console.log("Scraping completed");
};
Expand All @@ -73,7 +89,7 @@ const main = async () => {
process.exit(1);
}

await scrapeGitHub(orgName, date, Number(numDays));
await scrapeGitHub(orgName, date, Number(numDays), orgName);
await merged_data(dataDir, processedData);
console.log("Done");
};
Expand Down
7 changes: 6 additions & 1 deletion scraper/src/github-scraper/parseEvents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ function appendEvent(user: string, event: Activity) {
activity: [event],
open_prs: [],
authored_issue_and_pr: [],
discussions: [],
};
} else {
processedData[user]["activity"].push(event);
Expand Down Expand Up @@ -123,7 +124,11 @@ export const parseEvents = async (events: IGitHubEvent[]) => {
const user = event.actor.login;
if (isBlacklisted(user)) continue;

console.log("Processing event for user:", user + " | " + "event_id : ", event.id);
console.log(
"Processing event for user:",
user + " | " + "event_id : ",
event.id,
);

switch (event.type) {
case "IssueCommentEvent":
Expand Down
44 changes: 44 additions & 0 deletions scraper/src/github-scraper/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ export interface ActivityData {
open_prs: OpenPr[];
pr_stale?: number;
authored_issue_and_pr: AuthoredIssueAndPr[];
discussions?: any;
}
export interface ProcessData {
[key: string]: ActivityData;
Expand Down Expand Up @@ -194,3 +195,46 @@ export interface AuthoredIssueAndPr {
issue_link: string;
pr_link: string;
}

/**
 * One discussion edge as selected by the GraphQL query in discussion.ts;
 * the field set mirrors the query's selection exactly.
 *
 * NOTE(review): GitHub's API can report a null author for deleted ("ghost")
 * accounts, which this type models as always present — TODO confirm before
 * relying on `author.login` being non-null.
 */
export type Discussion = {
  node: {
    id: string;
    title: string;
    // User who opened the discussion.
    author: {
      login: string;
      avatarUrl: string;
    };
    url: string;
    category: {
      id: string;
      name: string;
      emoji: string;
    };
    upvoteCount: number;
    reactions: {
      totalCount: number;
    };
    // First 10 comments only — capped by the query (comments(first: 10)).
    comments: {
      edges: {
        node: {
          author: {
            login: string;
            avatarUrl: string;
          };
          upvoteCount: number;
          // True when this comment is marked as the discussion's answer.
          isAnswer: boolean;
        };
      }[];
    };
    createdAt: string;
    isAnswered: boolean;
  };
};

/**
 * Repository edge from the organization query; only the nested discussion
 * edges are consumed (the repo's `name` field is fetched but unused).
 */
export type Edge = {
  node: {
    discussions: {
      edges: Discussion[];
    };
  };
};
5 changes: 2 additions & 3 deletions scraper/src/github-scraper/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import { octokit } from "./config.js";
import { Action, ActivityData, PullRequestEvent } from "./types.js";
import { promises as fs } from "fs";


export const parseISODate = (isoDate: Date) => {
return new Date(isoDate);
};
Expand Down Expand Up @@ -89,8 +88,8 @@ export async function calculateTurnaroundTime(event: PullRequestEvent) {
assignedAts.length === 0
? null
: assignedAts.reduce((min, current) =>
current.time < min.time ? current : min,
).time;
current.time < min.time ? current : min,
).time;
const turnaroundTime =
(mergedAt.getTime() - (assignedAt || createdAt.getTime()).valueOf()) / 1000;
return turnaroundTime;
Expand Down
2 changes: 1 addition & 1 deletion scraper/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@
"strict": true,
"skipLibCheck": true
},
"include": ["src/github-scraper/**/*"],
"include": ["src/github-scraper/**/*", "dist/discssion.js"],
"exclude": ["node_modules"]
}
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@
"@/*": ["./*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts", "scraper/dist/discssion.js"],
"exclude": ["node_modules", "data-repo", "data"]
}

0 comments on commit a1491b9

Please sign in to comment.