Skip to content

Commit

Permalink
snapshot
Browse files Browse the repository at this point in the history
  • Loading branch information
lucasrodes committed Nov 5, 2024
1 parent 1ecb8c6 commit cca85b2
Show file tree
Hide file tree
Showing 10 changed files with 535 additions and 0 deletions.
180 changes: 180 additions & 0 deletions snapshots/covid/2024-11-05/get_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# issues

# issue_id, author_name, author_login, date_created, is_pr
# issue.number, issue.user.name, issue.user.login, issue.created_at, "pull/" in issue.html_url


# comments

# comment_id, date_created, date_updated, user_id, issue_id
# comment.id, comment.created_at, comment.updated_at, comment.user.id, issue.number


# users

# user_id, user_login, user_name, user_location
# user.id, user.login, user.name, user.location


from datetime import datetime
from typing import Optional

import github
import github.PullRequest
import github.Repository
import pandas as pd
from github import Auth, Github

from etl import config

# FLAGS
EXECUTE_ISSUES = False
EXECUTE_PRS = False
EXECUTE_COMMIT = True


def get_repo(repo_name: str, access_token: Optional[str] = None) -> github.Repository.Repository:
    """Return a handle to the repository ``owid/<repo_name>``.

    Args:
        repo_name: Repository name, without the ``owid/`` organisation prefix.
        access_token: Token used to authenticate. When empty or ``None``,
            ``config.OWIDBOT_ACCESS_TOKEN`` is used instead.

    Raises:
        AssertionError: If no token was given and ``config.OWIDBOT_ACCESS_TOKEN``
            is not set.
    """
    token = access_token
    if not token:
        # Fall back to the token from the project configuration.
        token = config.OWIDBOT_ACCESS_TOKEN
        assert token, "OWIDBOT_ACCESS_TOKEN is not set"

    client = Github(auth=Auth.Token(token))
    return client.get_repo(f"owid/{repo_name}")


def process_issue(issue_or_pr, users):
    """Extract tabular data from one issue (or pull request) and its comments.

    Args:
        issue_or_pr: Object exposing ``number``, ``user``, ``created_at``,
            ``html_url`` and ``get_comments()`` (presumably a PyGithub issue or
            pull request; verify against caller).
        users: Dictionary keyed by user id. It is updated in place with every
            commenter that is not registered yet.

    Returns:
        A tuple ``(issue_or_pr_data, issue_or_pr_comments, users)``:
        a dictionary describing the issue/PR, a list with one dictionary per
        comment, and the same ``users`` dictionary that was passed in.

    Notes:
        The author of the issue/PR or of a comment can be missing (``None``).
        In that case the author fields / ``user_id`` are stored as ``None`` and
        nothing is added to ``users``, instead of failing with AttributeError.
        Only commenters are registered in ``users``; the issue/PR author is not.
    """
    date_format = "%Y-%m-%d %H:%M:%S"

    author = issue_or_pr.user
    issue_or_pr_data = {
        # The human-facing number (#123) is used as identifier.
        "issue_id": issue_or_pr.number,
        "author_name": None if author is None else author.name,
        "author_login": None if author is None else author.login,
        "date_created": issue_or_pr.created_at.strftime(date_format),
        # Pull requests are recognised by their URL (".../pull/<number>").
        "is_pr": "pull/" in issue_or_pr.html_url,
    }
    issue_or_pr_comments = []

    for comment in issue_or_pr.get_comments():
        user = comment.user
        user_id = None if user is None else user.id
        issue_or_pr_comments.append(
            {
                "comment_id": comment.id,
                "date_created": comment.created_at.strftime(date_format),
                "date_updated": comment.updated_at.strftime(date_format),
                "user_id": user_id,
                "issue_id": issue_or_pr.number,
            }
        )

        # Register each commenter once; skip comments without a user.
        if user is not None and user_id not in users:
            users[user_id] = {
                "user_login": user.login,
                "user_name": user.name,
                "user_location": user.location,
            }

    return issue_or_pr_data, issue_or_pr_comments, users


# Handle to the owid/covid-19-data repository, authenticated with config.GITHUB_TOKEN
# (get_repo falls back to config.OWIDBOT_ACCESS_TOKEN if that token is empty).
repo = get_repo(repo_name="covid-19-data", access_token=config.GITHUB_TOKEN)


######################################################
# GET DATA FOR ISSUES
######################################################
if EXECUTE_ISSUES:
    # Output containers: one row per issue, one row per comment, and one entry
    # per distinct commenter (keyed by user id).
    issues = []
    comments = []
    users = {}

    # Get issues (all states)
    issues_raw = repo.get_issues(state="all")
    total_issues = issues_raw.totalCount  # Total number of issues for progress tracking

    # Retrieve relevant data (several API calls).
    # Counting from 1 so that the message, printed after an issue has been
    # processed, reports how many issues are actually done (it used to start at 0).
    for i, issue in enumerate(issues_raw, start=1):
        issue_data, issue_comments, users = process_issue(issue, users)
        issues.append(issue_data)
        comments.extend(issue_comments)
        print(f"Progress: {i}/{total_issues} issues processed")

    # Export. `rand` is a timestamp suffix, so files from different runs get
    # different names.
    rand = datetime.now().strftime("%Y%m%d%H%M%S")
    pd.DataFrame(comments).to_csv(f"gh_stats/comments-issues-{rand}.csv", index=False)
    pd.DataFrame(issues).to_csv(f"gh_stats/issues-{rand}.csv", index=False)
    # `users` maps user id -> fields; transpose to get one row per user.
    pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/users-issues-{rand}.csv", index=False)

######################################################
# GET DATA FOR PRS
######################################################
if EXECUTE_PRS:
    # Output containers: one row per pull request, one row per comment, and one
    # entry per distinct commenter (keyed by user id).
    prs = []
    comments = []
    users = {}

    # Get PRs (all states)
    prs_raw = repo.get_pulls(state="all")
    total_prs = prs_raw.totalCount  # Total number of PRs for progress tracking

    # Retrieve relevant data (several API calls).
    # NOTE(review): process_issue calls `pr.get_comments()`; for pull requests this
    # presumably returns review (diff) comments rather than conversation comments.
    # Confirm this is the intended set.
    # Counting from 1 so that the message, printed after a PR has been processed,
    # reports how many PRs are actually done (it used to start at 0).
    for i, pr in enumerate(prs_raw, start=1):
        pr_data, issue_comments, users = process_issue(pr, users)
        prs.append(pr_data)
        comments.extend(issue_comments)
        print(f"Progress: {i}/{total_prs} PRs processed")

    # Export. `rand` is a timestamp suffix, so files from different runs get
    # different names.
    rand = datetime.now().strftime("%Y%m%d%H%M%S")
    pd.DataFrame(comments).to_csv(f"gh_stats/comments-prs-{rand}.csv", index=False)
    pd.DataFrame(prs).to_csv(f"gh_stats/prs-{rand}.csv", index=False)
    # `users` maps user id -> fields; transpose to get one row per user.
    pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/users-prs-{rand}.csv", index=False)

######################################################
# GET DATA FOR COMMITS
######################################################
if EXECUTE_COMMIT:
    # Output containers: one row per commit, and one entry per distinct GitHub
    # account that authored a commit (keyed by user id).
    commits = []
    users = {}

    # Get commits
    commits_raw = repo.get_commits()
    total_commits = commits_raw.totalCount  # Total number of commits for progress tracking

    # Retrieve relevant data (several API calls)
    for i, c in enumerate(commits_raw):
        # Printed before commit `i` is handled, so `i` commits are done at this point.
        if i % 10 == 0:
            print(f"Progress: {i}/{total_commits} commits processed")

        # A commit is not always linked to a GitHub account (e.g. the author
        # e-mail is unknown to GitHub); `c.author` is then None. Keep the commit
        # with an empty user_id instead of crashing in the middle of the run.
        user = c.author
        user_id = None if user is None else user.id
        commit_raw = {
            "sha": c.sha,
            "date": c.commit.author.date.strftime("%Y-%m-%d %H:%M:%S"),
            "files_changed": len(c.files),
            "lines_changed": c.stats.total,
            "lines_deleted": c.stats.deletions,
            "lines_added": c.stats.additions,
            "user_id": user_id,
        }
        commits.append(commit_raw)

        # Add user (once per account; nothing to add for unlinked commits)
        if user is not None and user_id not in users:
            users[user_id] = {
                "user_login": user.login,
                "user_name": user.name,
                "user_location": user.location,
            }

        # Checkpoint every 50 commits: dump everything gathered so far, so a
        # failure later in the run does not lose the work already done.
        if i != 0 and i % 50 == 0:
            print(f"Exporting {i}...")
            rand = datetime.now().strftime("%Y%m%d%H%M%S")
            pd.DataFrame(commits).to_csv(f"gh_stats/commits/{i}-commits-{rand}.csv", index=False)
            pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/{i}-users-commits-{rand}.csv", index=False)

    # Final export. `rand` is a timestamp suffix, so files from different runs
    # get different names.
    rand = datetime.now().strftime("%Y%m%d%H%M%S")
    pd.DataFrame(commits).to_csv(f"gh_stats/commits/total-commits-{rand}.csv", index=False)
    pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/total-users-commits-{rand}.csv", index=False)
83 changes: 83 additions & 0 deletions snapshots/covid/2024-11-05/github_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Script to create a snapshot of dataset.
The data for these snapshots has been manually obtained via the GitHub API. We have obtained data on
- Issues: Comments, users that participated, etc.
- Pull Requests: Comments, users that participated, etc.
- Commits: Users that participated, etc.
If you want to retrieve this data again, please look at the script `get_stats.py` in the same folder. You can simply execute it. To run different parts of the script please use the variables at the top of the script EXECUTE_ISSUES, EXECUTE_PRS, EXECUTE_COMMIT.
python snapshots/covid/2024-11-05/github_stats.py \
--issues gh_stats/issues-20241104000000.csv \
--issues-comments gh_stats/comments-issues-20241104000000.csv \
--issues-users gh_stats/users-issues-20241104000000.csv \
--pr gh_stats/prs-20241105104652.csv \
--pr-comments gh_stats/comments-prs-20241105104652.csv \
--pr-users gh_stats/users-prs-20241105104652.csv \
--commits gh_stats/commits/8800-commits-20241105165504.csv \
--commits-users gh_stats/commits/8800-users-commits-20241105165504.csv
"""

from pathlib import Path
from typing import Optional

import click
from structlog import get_logger

from etl.snapshot import Snapshot

log = get_logger()


# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
@click.option("--issues", type=str, help="File with data on issues.")
@click.option("--issues-comments", type=str, help="File with data on issues comments.")
@click.option("--issues-users", type=str, help="File with data on users that commented in issues.")
@click.option("--pr", type=str, help="File with data on PRs.")
@click.option("--pr-comments", type=str, help="File with data on PR comments.")
@click.option("--pr-users", type=str, help="File with data on users that commented in PRs.")
@click.option("--commits", type=str, help="File with data on commits.")
@click.option("--commits-users", type=str, help="File with data on commit users.")
def main(
    upload: bool,
    issues: Optional[str] = None,
    issues_comments: Optional[str] = None,
    issues_users: Optional[str] = None,
    pr: Optional[str] = None,
    pr_comments: Optional[str] = None,
    pr_users: Optional[str] = None,
    commits: Optional[str] = None,
    commits_users: Optional[str] = None,
) -> None:
    # Each entry pairs the local file given on the command line (None when the
    # option was omitted) with the name of the snapshot it feeds.
    snapshot_paths = [
        (issues, "github_stats_issues.csv"),
        (issues_comments, "github_stats_issues_comments.csv"),
        (issues_users, "github_stats_issues_users.csv"),
        (pr, "github_stats_pr.csv"),
        (pr_comments, "github_stats_pr_comments.csv"),
        (pr_users, "github_stats_pr_users.csv"),
        (commits, "github_stats_commits.csv"),
        (commits_users, "github_stats_commits_users.csv"),
    ]

    for paths in snapshot_paths:
        local_file = paths[0]

        # Nothing provided for this snapshot: report it and move on.
        if local_file is None:
            log.warning(f"Skipping import for {paths[1]}.")
            continue

        log.info(f"Importing {paths[1]}.")
        # Snapshot lives under covid/<version>/, where the version is the name
        # of the folder that contains this script.
        snap = Snapshot(f"covid/{SNAPSHOT_VERSION}/{paths[1]}")

        # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
        snap.create_snapshot(filename=local_file, upload=upload)


# Run the command-line interface only when this file is executed as a script
# (not when it is imported as a module).
if __name__ == "__main__":
    main()
34 changes: 34 additions & 0 deletions snapshots/covid/2024-11-05/github_stats_commits.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: GitHub stats on owid/covid-19-data repository
description: |-
During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more.

This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible.
title_snapshot: "GitHub stats on owid/covid-19-data repository: Commits"
description_snapshot: |-
This snapshot contains the list of commits to the owid/covid-19-data GitHub repository.

date_published: 2024-11-05

# Citation
producer: Our World in Data
citation_full: |-
Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data.

# Files
url_main: https://github.com/owid/covid-19-data
date_accessed: 2024-11-05

# License
license:
name: CC BY 4.0
url: https://creativecommons.org/licenses/by/4.0/

outs:
- md5: 45236ca93183af7d057bb47afe37d715
size: 725503
path: github_stats_commits.csv
34 changes: 34 additions & 0 deletions snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: GitHub stats on owid/covid-19-data repository
description: |-
During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more.

This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible.
title_snapshot: "GitHub stats on owid/covid-19-data repository: Users from Commits"
description_snapshot: |-
This snapshot contains the list of users that submitted commits to the owid/covid-19-data GitHub repository.

date_published: 2024-11-05

# Citation
producer: Our World in Data
citation_full: |-
Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data.

# Files
url_main: https://github.com/owid/covid-19-data
date_accessed: 2024-11-05

# License
license:
name: CC BY 4.0
url: https://creativecommons.org/licenses/by/4.0/

outs:
- md5: ada1edd0b7fb873af437894bd7d13779
size: 301
path: github_stats_commits_users.csv
34 changes: 34 additions & 0 deletions snapshots/covid/2024-11-05/github_stats_issues.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: GitHub stats on owid/covid-19-data repository
description: |-
During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more.

This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible.
title_snapshot: "GitHub stats on owid/covid-19-data repository: Issues"
description_snapshot: |-
This snapshot contains the list of created issues on the owid/covid-19-data GitHub repository.

date_published: 2024-11-05

# Citation
producer: Our World in Data
citation_full: |-
Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data.

# Files
url_main: https://github.com/owid/covid-19-data
date_accessed: 2024-11-05

# License
license:
name: CC BY 4.0
url: https://creativecommons.org/licenses/by/4.0/

outs:
- md5: 8d8a60f7da5dfadfc22e9f76e79ef29f
size: 109219
path: github_stats_issues.csv
Loading

0 comments on commit cca85b2

Please sign in to comment.