
Scraper github.ts divided into different files for better understanding
dgparmar14 committed Jun 8, 2024
1 parent 57e9445 commit 0bb6d00
Showing 14 changed files with 1,824 additions and 881 deletions.
436 changes: 221 additions & 215 deletions scraper/src/github.ts → github.ts

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/octokit.ts
@@ -21,4 +21,4 @@ const octokit = new Octokit({
  auth: getGitHubAccessToken(),
});

export default octokit;
export default octokit;
3 changes: 2 additions & 1 deletion package.json
@@ -10,10 +10,11 @@
"lint-fix": "eslint . --fix",
"format": "prettier --write .",
"load-data": "node ./scripts/loadOrgData.js",
"prepare": "husky install"
"prepare": "husky install",
},
"dependencies": {
"@headlessui/react": "^1.7.18",
"@octokit/core": "^5.2.0",
"@t3-oss/env-nextjs": "^0.9.2",
"@vercel/kv": "^1.0.1",
"clsx": "^1.2.1",
1,420 changes: 798 additions & 622 deletions scraper/package-lock.json

Large diffs are not rendered by default.

44 changes: 23 additions & 21 deletions scraper/package.json
@@ -1,22 +1,24 @@
{
  "name": "scraper",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "@octokit/graphql": "^8.1.1",
    "@octokit/types": "^13.5.0",
    "date-fns": "^3.6.0",
    "octokit": "^4.0.2",
    "yargs": "^17.7.2"
  },
  "devDependencies": {
    "typescript": "^5.4.5"
  }
{
  "name": "scraper",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "type": "module",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "scraper": "node --loader ts-node/esm src/github-scraper/index.ts coronasafe ../../data-repo/github"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "date-fns": "^3.6.0",
    "octokit": "^4.0.2",
    "yargs": "^17.7.2"
  },
  "devDependencies": {
    "@types/node": "^16.11.18",
    "ts-node": "^10.9.2",
    "typescript": "^4.9.5"
  }
}
11 changes: 11 additions & 0 deletions scraper/src/github-scraper/config.ts
@@ -0,0 +1,11 @@
import { Octokit } from "octokit";

export const GITHUB_TOKEN = process.env.GITHUB_TOKEN;
if (!GITHUB_TOKEN) {
  console.error("GITHUB_TOKEN not found in environment");
  process.exit(1);
}

export const octokit = new Octokit({
  auth: GITHUB_TOKEN,
});
52 changes: 52 additions & 0 deletions scraper/src/github-scraper/fetchEvents.ts
@@ -0,0 +1,52 @@
import { octokit } from "./config.js";
import { IGitHubEvent } from "./types.js";

export const fetchEvents = async (
  org: string,
  startDate: Date,
  endDate: Date,
) => {
  const events = await octokit.paginate(
    "GET /orgs/{org}/events",
    {
      org: org,
      per_page: 1000, // Note: GitHub caps per_page at 100; larger values are clamped by the API
    },
    (response: { data: IGitHubEvent[] }) => {
      return response.data;
    },
  );

  let eventsCount: number = 0;
  let filteredEvents = [];
  for (const event of events) {
    const eventTime: Date = new Date(event.created_at ?? 0);

    if (eventTime > endDate) {
      continue;
    } else if (eventTime <= startDate) {
      // Events arrive newest first, so everything beyond this point is out of range
      return filteredEvents;
    }
    const isBlacklisted: boolean = [
      "dependabot",
      "snyk-bot",
      "codecov-commenter",
      "github-actions[bot]",
    ].includes(event.actor.login);
    const isRequiredEventType: boolean = [
      "IssueCommentEvent",
      "IssuesEvent",
      "PullRequestEvent",
      "PullRequestReviewEvent",
    ].includes(event.type ?? "");

    if (!isBlacklisted && isRequiredEventType) {
      console.log(event.type);
      filteredEvents.push(event);
    }
    eventsCount++;
  }
  console.log(`Fetched ${eventsCount} events`);

  return filteredEvents;
};
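
The IGitHubEvent type is imported from types.ts, one of the changed files that is not rendered on this page. As a rough sketch only, the fields fetchEvents actually touches (inferred from the code above and the GitHub Events API, not from the committed types.ts) would look something like:

// Hypothetical sketch — the real definition lives in scraper/src/github-scraper/types.ts
export interface IGitHubEvent {
  type: string | null;        // e.g. "PullRequestEvent", checked against the allow-list
  created_at: string | null;  // ISO timestamp, parsed for the date-range filter
  actor: { login: string };   // compared against the bot blacklist
  // remaining payload fields are consumed later by parseEvents
}
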
63 changes: 63 additions & 0 deletions scraper/src/github-scraper/fetchUserData.ts
@@ -0,0 +1,63 @@
import { octokit } from "./config.js";
import { OpenPr } from "./types.js";
import { resolve_autonomy_responsibility } from "./utils.js";

export const fetch_merge_events = async (user: string, org: string) => {
  console.log("Merge events for : ", user);

  // Fetching closed issues authored by the user
  const { data: issues } = await octokit.request("GET /search/issues", {
    q: `is:issue is:closed org:${org} author:${user}`,
  });

  let merged_prs = [];

  for (const issue of issues.items) {
    const { data: timeline_events } = await octokit.request(
      "GET " + issue.timeline_url,
    );

    for (const event of timeline_events) {
      if (await resolve_autonomy_responsibility(event, user)) {
        const pull_request = event.source.issue.pull_request;
        if (pull_request && pull_request.merged_at) {
          merged_prs.push({
            issue_link: issue.html_url,
            pr_link: pull_request.html_url,
          });
        }
      }
    }
  }

  return merged_prs;
};

export const fetchOpenPulls = async (user: string, org: string) => {
  console.log(`Fetching open pull requests for ${user}`);
  const { data } = await octokit.request("GET /search/issues", {
    q: `is:pr is:open org:${org} author:${user}`,
  });

  type PullsData = (typeof data.items)[0];
  let pulls: PullsData[] = data.items;
  let open_prs: OpenPr[] = [];

  pulls.forEach((pr: PullsData) => {
    let today: Date = new Date();
    let prLastUpdated: Date = new Date(pr.updated_at);
    let staleFor: number = Math.floor(
      (today.getTime() - prLastUpdated.getTime()) / (1000 * 60 * 60 * 24),
    );

    open_prs.push({
      link: pr.html_url,
      title: pr.title,
      stale_for: staleFor,
      labels: pr.labels.map((label: { name: string }) => label.name),
    });
  });

  console.log(`Fetched ${pulls.length} open pull requests for ${user}`);
  return open_prs;
};
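
OpenPr is likewise imported from types.ts, which is not shown in this view. Judging purely from the object literal pushed into open_prs above, a compatible shape would be roughly:

// Hypothetical sketch inferred from fetchOpenPulls; the committed type is in scraper/src/github-scraper/types.ts
export interface OpenPr {
  link: string;       // pr.html_url
  title: string;      // pr.title
  stale_for: number;  // whole days since the PR was last updated
  labels: string[];   // label names only
}
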
81 changes: 81 additions & 0 deletions scraper/src/github-scraper/index.ts
@@ -0,0 +1,81 @@
import { formatISO, parseISO, startOfDay, subDays } from "date-fns";
import { IGitHubEvent, ProcessData } from "./types.js";
import { fetch_merge_events, fetchOpenPulls } from "./fetchUserData.js";
import { fetchEvents } from "./fetchEvents.js";
import { parseEvents } from "./parseEvents.js";
import { merged_data } from "./saveData.js";

let processedData: ProcessData = {};

const scrapeGitHub = async (
  org: string,
  date: string,
  numDays: number = 1,
): Promise<void> => {
  const endDate: Date = startOfDay(parseISO(date));
  const startDate: Date = startOfDay(subDays(endDate, numDays));
  console.log(
    `Scraping GitHub data for ${org} from ${formatISO(startDate)} to ${formatISO(endDate)}`,
  );

  const events: IGitHubEvent[] = (await fetchEvents(
    org,
    startDate,
    endDate,
  )) as IGitHubEvent[];
  processedData = await parseEvents(events);

  for (const user of Object.keys(processedData)) {
    if (!processedData[user]) {
      processedData[user] = {
        authored_issue_and_pr: [],
        last_updated: "",
        activity: [],
        open_prs: [],
      };
    }
    try {
      const merged_prs = await fetch_merge_events(user, org);
      for (const pr of merged_prs) {
        processedData[user].authored_issue_and_pr.push(pr);
      }
    } catch (e) {
      console.error(`Error fetching merge events for ${user}: ${e}`);
    }
    try {
      const open_prs = await fetchOpenPulls(user, org);
      for (const pr of open_prs) {
        processedData[user].open_prs.push(pr);
      }
    } catch (e) {
      console.error(`Error fetching open pulls for ${user}: ${e}`);
    }
  }

  console.log("Scraping completed");
};

// Type Done and check done
const main = async () => {
  // Extract command line arguments (skip the first two default arguments)
  const args: string[] = process.argv.slice(2);

  // Destructure arguments with default values
  const [
    orgName,
    dataDir,
    date = formatISO(subDays(new Date(), 1), { representation: "date" }),
    numDays = 1,
  ] = args;

  if (!orgName || !dataDir) {
    console.error("Usage: node script.js <org> <dataDir> [date] [numDays]");
    process.exit(1);
  }

  await scrapeGitHub(orgName, date, Number(numDays));
  await merged_data(dataDir, processedData);
  console.log("Done");
};

main();
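
ProcessData, parseEvents, and merged_data come from companion files in this commit (types.ts, parseEvents.ts, saveData.ts) that are not rendered above. From the way index.ts initialises a missing user, ProcessData appears to map a GitHub username to that user's collected activity; a rough sketch under that assumption (field types guessed from usage, not the committed definition):

// Hypothetical sketch — see scraper/src/github-scraper/types.ts for the real type
export type ProcessData = Record<
  string,
  {
    authored_issue_and_pr: { issue_link: string; pr_link: string }[]; // from fetch_merge_events
    open_prs: OpenPr[];                                               // from fetchOpenPulls
    activity: unknown[];                                              // populated by parseEvents
    last_updated: string;
  }
>;

Per the new "scraper" script in scraper/package.json, this entry point runs as node --loader ts-node/esm src/github-scraper/index.ts <org> <dataDir> [date] [numDays], with coronasafe and ../../data-repo/github supplied as the first two arguments.
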

0 comments on commit 0bb6d00
