snapshot

owid · Nov 5, 2024 · cca85b2 · cca85b2
1 parent 1ecb8c6
commit cca85b2
Show file tree

Hide file tree

Showing 10 changed files with 535 additions and 0 deletions.
diff --git a/snapshots/covid/2024-11-05/get_stats.py b/snapshots/covid/2024-11-05/get_stats.py
@@ -0,0 +1,180 @@
+# issues
+
+# issue_id, author_name, author_login, date_created, is_pr
+# issue.number, issue.user.name, issue.user.login, issue.created_at, "pull/" in issue.html_url
+
+
+# comments
+
+# comment_id, date_created, date_updated, user_id, issue_id
+# comment.id, comment.created_at, comment.updated_at, comment.user.id, issue.number
+
+
+# users
+
+# user_id, user_login, user_name, user_location
+# user.id, user.login, user.name, user.location
+
+
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import github
+import github.PullRequest
+import github.Repository
+import pandas as pd
+from github import Auth, Github
+
+from etl import config
+
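+# FLAGS: choose which sections of this script run (issues, pull requests, commits). Each section exports its own CSV files.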
+EXECUTE_ISSUES = False
+EXECUTE_PRS = False
+EXECUTE_COMMIT = True
+
+
+def get_repo(repo_name: str, access_token: Optional[str] = None) -> github.Repository.Repository:
+    """Return the `owid/<repo_name>` repository using a token-authenticated GitHub client.
+
+    When `access_token` is empty or None, `config.OWIDBOT_ACCESS_TOKEN` is used instead,
+    and an AssertionError is raised if that one is not set either.
+    """
+    token = access_token
+    if not token:
+        # Fall back to the bot token from the ETL configuration.
+        token = config.OWIDBOT_ACCESS_TOKEN
+        assert token, "OWIDBOT_ACCESS_TOKEN is not set"
+    client = Github(auth=Auth.Token(token))
+    return client.get_repo(f"owid/{repo_name}")
+
+
+def process_issue(issue_or_pr, users):
+    """Extract tabular data for one issue (or pull request) and its comments.
+
+    Args:
+        issue_or_pr: Issue or pull request object. Only `number`, `user`, `created_at`,
+            `html_url` and `get_comments()` are read from it.
+        users: Dictionary mapping user id to user details. It is updated in place with
+            every comment author not seen before.
+
+    Returns:
+        Tuple `(issue_or_pr_data, issue_or_pr_comments, users)`: a dictionary describing the
+        issue, a list with one dictionary per comment, and the (same) `users` dictionary.
+    """
+    timestamp_format = "%Y-%m-%d %H:%M:%S"
+
+    # The user can be missing (None), e.g. for deleted accounts. Record empty values instead of
+    # crashing half-way through a long crawl.
+    author = issue_or_pr.user
+    issue_or_pr_data = {
+        "issue_id": issue_or_pr.number,
+        "author_name": author.name if author is not None else None,
+        "author_login": author.login if author is not None else None,
+        "date_created": issue_or_pr.created_at.strftime(timestamp_format),
+        "is_pr": "pull/" in issue_or_pr.html_url,
+    }
+    issue_or_pr_comments = []
+
+    for comment in issue_or_pr.get_comments():
+        user = comment.user
+        issue_or_pr_comments.append(
+            {
+                "comment_id": comment.id,
+                "date_created": comment.created_at.strftime(timestamp_format),
+                "date_updated": comment.updated_at.strftime(timestamp_format),
+                "user_id": user.id if user is not None else None,
+                "issue_id": issue_or_pr.number,
+            }
+        )
+
+        # Only comment authors are stored in `users`, and each of them only once.
+        if user is not None and user.id not in users:
+            users[user.id] = {
+                "user_login": user.login,
+                "user_name": user.name,
+                "user_location": user.location,
+            }
+
+    return issue_or_pr_data, issue_or_pr_comments, users
+
+
+# Get repository (if `config.GITHUB_TOKEN` is empty, `get_repo` falls back to `config.OWIDBOT_ACCESS_TOKEN`).
+repo = get_repo("covid-19-data", access_token=config.GITHUB_TOKEN)
+
+
+######################################################
+# GET DATA FOR ISSUES
+######################################################
+if EXECUTE_ISSUES:
+    # Create the output folder up front: a missing folder would otherwise only be noticed
+    # when exporting, i.e. after all the API calls below have been made.
+    Path("gh_stats").mkdir(parents=True, exist_ok=True)
+
+    # Initialize lists (we will store output data here)
+    issues = []
+    comments = []
+    users = {}
+
+    # Get issues
+    issues_raw = repo.get_issues(state="all")
+    total_issues = issues_raw.totalCount  # Total number of issues for progress tracking
+
+    # Retrieve relevant data (several API calls). Counting starts at 1 so that the last
+    # progress message reads "<total>/<total>".
+    for i, issue in enumerate(issues_raw, start=1):
+        issue_data, issue_comments, users = process_issue(issue, users)
+        issues.append(issue_data)
+        comments.extend(issue_comments)
+        print(f"Progress: {i}/{total_issues} issues processed")
+
+    # Export. `rand` is a timestamp suffix, so repeated runs do not overwrite earlier files.
+    rand = datetime.now().strftime("%Y%m%d%H%M%S")
+    pd.DataFrame(comments).to_csv(f"gh_stats/comments-issues-{rand}.csv", index=False)
+    pd.DataFrame(issues).to_csv(f"gh_stats/issues-{rand}.csv", index=False)
+    # `users` is keyed by user id: transpose to get one row per user, then move the id
+    # from the index into a regular column.
+    pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/users-issues-{rand}.csv", index=False)
+
+######################################################
+# GET DATA FOR PRS
+######################################################
+if EXECUTE_PRS:
+    # Create the output folder up front: a missing folder would otherwise only be noticed
+    # when exporting, i.e. after all the API calls below have been made.
+    Path("gh_stats").mkdir(parents=True, exist_ok=True)
+
+    # Initialize lists (we will store output data here)
+    prs = []
+    comments = []
+    users = {}
+
+    # Get PRs
+    prs_raw = repo.get_pulls(state="all")
+    total_prs = prs_raw.totalCount  # Total number of PRs for progress tracking
+
+    # Retrieve relevant data (several API calls). Counting starts at 1 so that the last
+    # progress message reads "<total>/<total>".
+    for i, pr in enumerate(prs_raw, start=1):
+        pr_data, issue_comments, users = process_issue(pr, users)
+        prs.append(pr_data)
+        comments.extend(issue_comments)
+        print(f"Progress: {i}/{total_prs} PRs processed")
+
+    # Export. `rand` is a timestamp suffix, so repeated runs do not overwrite earlier files.
+    rand = datetime.now().strftime("%Y%m%d%H%M%S")
+    pd.DataFrame(comments).to_csv(f"gh_stats/comments-prs-{rand}.csv", index=False)
+    pd.DataFrame(prs).to_csv(f"gh_stats/prs-{rand}.csv", index=False)
+    # `users` is keyed by user id: transpose to get one row per user, then move the id
+    # from the index into a regular column.
+    pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/users-prs-{rand}.csv", index=False)
+
+######################################################
+# GET DATA FOR COMMITS
+######################################################
+if EXECUTE_COMMIT:
+    # Create the output folder up front: a missing folder would otherwise only be noticed
+    # at the first export, after the API calls for the first commits have been made.
+    Path("gh_stats/commits").mkdir(parents=True, exist_ok=True)
+
+    # Initialize lists (we will store output data here)
+    commits = []
+    users = {}
+
+    # Get commits
+    commits_raw = repo.get_commits()
+    total_commits = commits_raw.totalCount  # Total number of commits for progress tracking
+
+    # Retrieve relevant data (several API calls)
+    for i, c in enumerate(commits_raw):
+        if i % 10 == 0:
+            print(f"Progress: {i}/{total_commits} commits processed")
+        # The commit author can be missing (None), e.g. when the commit e-mail is not linked
+        # to a GitHub account. Keep the commit and leave its `user_id` empty.
+        user = c.author
+        commit_raw = {
+            "sha": c.sha,
+            "date": c.commit.author.date.strftime("%Y-%m-%d %H:%M:%S"),
+            "files_changed": len(c.files),
+            "lines_changed": c.stats.total,
+            "lines_deleted": c.stats.deletions,
+            "lines_added": c.stats.additions,
+            "user_id": user.id if user is not None else None,
+        }
+        commits.append(commit_raw)
+        # Add user (only once, and only if the commit is linked to one)
+        if user is not None and user.id not in users:
+            users[user.id] = {
+                "user_login": user.login,
+                "user_name": user.name,
+                "user_location": user.location,
+            }
+
+        # Checkpoint: every 50 commits, export everything collected so far, so that an
+        # interrupted run still leaves usable files behind.
+        if i != 0 and i % 50 == 0:
+            # Export
+            print(f"Exporting {i}...")
+            rand = datetime.now().strftime("%Y%m%d%H%M%S")
+            pd.DataFrame(commits).to_csv(f"gh_stats/commits/{i}-commits-{rand}.csv", index=False)
+            pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/{i}-users-commits-{rand}.csv", index=False)
+
+    # Export (final files, with all commits)
+    rand = datetime.now().strftime("%Y%m%d%H%M%S")
+    pd.DataFrame(commits).to_csv(f"gh_stats/commits/total-commits-{rand}.csv", index=False)
+    pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/total-users-commits-{rand}.csv", index=False)
diff --git a/snapshots/covid/2024-11-05/github_stats.py b/snapshots/covid/2024-11-05/github_stats.py
@@ -0,0 +1,83 @@
+"""Script to create a snapshot of dataset.
+
+The data for these snapshots has been manually obtained via the GitHub API. We have obtained data on
+
+- Issues: Comments, users that participated, etc.
+- Pull Requests: Comments, users that participated, etc.
+- Commits: Users that participated, etc.
+
+If you want to retrieve this data again, run the script `get_stats.py` in the same folder. To run only some parts of that script, set the flags EXECUTE_ISSUES, EXECUTE_PRS and EXECUTE_COMMIT at the top of it. Then pass the exported files to this script, e.g.:
+
+    python snapshots/covid/2024-11-05/github_stats.py \
+        --issues gh_stats/issues-20241104000000.csv \
+        --issues-comments gh_stats/comments-issues-20241104000000.csv \
+        --issues-users gh_stats/users-issues-20241104000000.csv \
+        --pr gh_stats/prs-20241105104652.csv \
+        --pr-comments gh_stats/comments-prs-20241105104652.csv \
+        --pr-users gh_stats/users-prs-20241105104652.csv \
+        --commits gh_stats/commits/8800-commits-20241105165504.csv \
+        --commits-users gh_stats/commits/8800-users-commits-20241105165504.csv
+
+"""
+
+from pathlib import Path
+from typing import Optional
+
+import click
+from structlog import get_logger
+
+from etl.snapshot import Snapshot
+
+log = get_logger()
+
+
+# Version for current snapshot dataset (taken from the name of the folder that contains this script).
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--issues", type=str, help="File with data on issues.")
+@click.option("--issues-comments", type=str, help="File with data on issues comments.")
+@click.option("--issues-users", type=str, help="File with data on users that commented in issues.")
+@click.option("--pr", type=str, help="File with data on PRs.")
+@click.option("--pr-comments", type=str, help="File with data on PR comments.")
+@click.option("--pr-users", type=str, help="File with data on users that commented in PRs.")
+@click.option("--commits", type=str, help="File with data on commits.")
+@click.option("--commits-users", type=str, help="File with data on commit users.")
+def main(
+    upload: bool,
+    issues: Optional[str] = None,
+    issues_comments: Optional[str] = None,
+    issues_users: Optional[str] = None,
+    pr: Optional[str] = None,
+    pr_comments: Optional[str] = None,
+    pr_users: Optional[str] = None,
+    commits: Optional[str] = None,
+    commits_users: Optional[str] = None,
+) -> None:
+    # Create one snapshot per local file given on the command line.
+    # Documented with comments (not a docstring) so that the command's help text stays as it is.
+
+    # Each local file (may be None if its option was not given) and the snapshot it feeds.
+    snapshot_paths = [
+        (issues, "github_stats_issues.csv"),
+        (issues_comments, "github_stats_issues_comments.csv"),
+        (issues_users, "github_stats_issues_users.csv"),
+        (pr, "github_stats_pr.csv"),
+        (pr_comments, "github_stats_pr_comments.csv"),
+        (pr_users, "github_stats_pr_users.csv"),
+        (commits, "github_stats_commits.csv"),
+        (commits_users, "github_stats_commits_users.csv"),
+    ]
+
+    for local_file, snapshot_name in snapshot_paths:
+        # Nothing to import when the corresponding option was not given.
+        if local_file is None:
+            log.warning(f"Skipping import for {snapshot_name}.")
+            continue
+
+        log.info(f"Importing {snapshot_name}.")
+        # Create a new snapshot.
+        snap = Snapshot(f"covid/{SNAPSHOT_VERSION}/{snapshot_name}")
+
+        # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+        snap.create_snapshot(filename=local_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc b/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc
@@ -0,0 +1,34 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: GitHub stats on owid/covid-19-data repository
+    description: |-
+      During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more.
+
+      This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible.
+    title_snapshot: "GitHub stats on owid/covid-19-data repository: Commits"
+    description_snapshot: |-
+      This snapshot contains the list of commits to the owid/covid-19-data GitHub repository.
+
+    date_published: 2024-11-05
+
+    # Citation
+    producer: Our World in Data
+    citation_full: |-
+      Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data.
+
+    # Files
+    url_main: https://github.com/owid/covid-19-data
+    date_accessed: 2024-11-05
+
+    # License
+    license:
+      name: CC BY 4.0
+      url: https://creativecommons.org/licenses/by/4.0/
+
+outs:
+  - md5: 45236ca93183af7d057bb47afe37d715
+    size: 725503
+    path: github_stats_commits.csv
diff --git a/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc
@@ -0,0 +1,34 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: GitHub stats on owid/covid-19-data repository
+    description: |-
+      During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more.
+
+      This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible.
+    title_snapshot: "GitHub stats on owid/covid-19-data repository: Users from Commits"
+    description_snapshot: |-
+      This snapshot contains the list of users that submitted commits to the owid/covid-19-data GitHub repository.
+
+    date_published: 2024-11-05
+
+    # Citation
+    producer: Our World in Data
+    citation_full: |-
+      Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data.
+
+    # Files
+    url_main: https://github.com/owid/covid-19-data
+    date_accessed: 2024-11-05
+
+    # License
+    license:
+      name: CC BY 4.0
+      url: https://creativecommons.org/licenses/by/4.0/
+
+outs:
+  - md5: ada1edd0b7fb873af437894bd7d13779
+    size: 301
+    path: github_stats_commits_users.csv
diff --git a/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc b/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc
@@ -0,0 +1,34 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: GitHub stats on owid/covid-19-data repository
+    description: |-
+      During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more.
+
+      This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible.
+    title_snapshot: "GitHub stats on owid/covid-19-data repository: Issues"
+    description_snapshot: |-
+      This snapshot contains the list of created issues on the owid/covid-19-data GitHub repository.
+
+    date_published: 2024-11-05
+
+    # Citation
+    producer: Our World in Data
+    citation_full: |-
+      Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data.
+
+    # Files
+    url_main: https://github.com/owid/covid-19-data
+    date_accessed: 2024-11-05
+
+    # License
+    license:
+      name: CC BY 4.0
+      url: https://creativecommons.org/licenses/by/4.0/
+
+outs:
+  - md5: 8d8a60f7da5dfadfc22e9f76e79ef29f
+    size: 109219
+    path: github_stats_issues.csv