From 1ecb8c638dcd5f7a5505c3d83747774ae3835062 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Mon, 4 Nov 2024 14:56:48 +0100 Subject: [PATCH 1/6] =?UTF-8?q?=F0=9F=93=8A=20covid:=20mdims?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From cca85b2f74685e28aaa6edacd5067ac07883e122 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Tue, 5 Nov 2024 16:58:24 +0100 Subject: [PATCH 2/6] snapshot --- snapshots/covid/2024-11-05/get_stats.py | 180 ++++++++++++++++++ snapshots/covid/2024-11-05/github_stats.py | 83 ++++++++ .../2024-11-05/github_stats_commits.csv.dvc | 34 ++++ .../github_stats_commits_users.csv.dvc | 34 ++++ .../2024-11-05/github_stats_issues.csv.dvc | 34 ++++ .../github_stats_issues_comments.csv.dvc | 34 ++++ .../github_stats_issues_users.csv.dvc | 34 ++++ .../covid/2024-11-05/github_stats_pr.csv.dvc | 34 ++++ .../github_stats_pr_comments.csv.dvc | 34 ++++ .../2024-11-05/github_stats_pr_users.csv.dvc | 34 ++++ 10 files changed, 535 insertions(+) create mode 100644 snapshots/covid/2024-11-05/get_stats.py create mode 100644 snapshots/covid/2024-11-05/github_stats.py create mode 100644 snapshots/covid/2024-11-05/github_stats_commits.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_issues.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_issues_comments.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_pr.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_pr_comments.csv.dvc create mode 100644 snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc diff --git a/snapshots/covid/2024-11-05/get_stats.py b/snapshots/covid/2024-11-05/get_stats.py new file mode 100644 index 00000000000..74efe7be8e9 --- /dev/null +++ b/snapshots/covid/2024-11-05/get_stats.py @@ -0,0 +1,180 @@ +# issues + +# issue_id, 
author_name, author_login, date_created, is_pr +# issue.number, issue.user.name, issue.user.login, issue.created_at, "pull/" in issue.html_url + + +# comments + +# comment_id, date_created, date_updated, user_id, issue_id +# comment.id, comment.created_at, comment.updated_at, comment.user.id, issue.number + + +# users + +# user_id, user_login, user_name, user_location +# user.id, user.login, user.name, user.location + + +from datetime import datetime +from typing import Optional + +import github +import github.PullRequest +import github.Repository +import pandas as pd +from github import Auth, Github + +from etl import config + +# FLAGS +EXECUTE_ISSUES = False +EXECUTE_PRS = False +EXECUTE_COMMIT = True + + +def get_repo(repo_name: str, access_token: Optional[str] = None) -> github.Repository.Repository: + """Get repository.""" + if not access_token: + assert config.OWIDBOT_ACCESS_TOKEN, "OWIDBOT_ACCESS_TOKEN is not set" + access_token = config.OWIDBOT_ACCESS_TOKEN + auth = Auth.Token(access_token) + g = Github(auth=auth) + return g.get_repo(f"owid/{repo_name}") + + +def process_issue(issue_or_pr, users): + """Function to process each issue and its comments.""" + issue_or_pr_data = { + "issue_id": issue_or_pr.number, + "author_name": issue_or_pr.user.name, + "author_login": issue_or_pr.user.login, + "date_created": issue_or_pr.created_at.strftime("%Y-%m-%d %H:%M:%S"), + "is_pr": "pull/" in issue_or_pr.html_url, + } + issue_or_pr_comments = [] + + for comment in issue_or_pr.get_comments(): + user = comment.user + issue_or_pr_comments.append( + { + "comment_id": comment.id, + "date_created": comment.created_at.strftime("%Y-%m-%d %H:%M:%S"), + "date_updated": comment.updated_at.strftime("%Y-%m-%d %H:%M:%S"), + "user_id": user.id, + "issue_id": issue_or_pr.number, + } + ) + + if user.id not in users: + users[user.id] = { + "user_login": user.login, + "user_name": user.name, + "user_location": user.location, + } + + return issue_or_pr_data, issue_or_pr_comments, users + + +# Get repository +repo = 
get_repo("covid-19-data", access_token=config.GITHUB_TOKEN) + + +###################################################### +# GET DATA FOR ISSUES +###################################################### +if EXECUTE_ISSUES: + # Initialize lists (we will store output data here) + issues = [] + comments = [] + users = {} + + # Get issues + issues_raw = repo.get_issues(state="all") + total_issues = issues_raw.totalCount # Total number of issues for progress tracking + + # Retrieve relevant data (several API calls) + for i, issue in enumerate(issues_raw): + issue_data, issue_comments, users = process_issue(issue, users) + issues.append(issue_data) + comments.extend(issue_comments) + print(f"Progress: {i}/{total_issues} issues processed") + + # Export + rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) + pd.DataFrame(comments).to_csv(f"gh_stats/comments-issues-{rand}.csv", index=False) + pd.DataFrame(issues).to_csv(f"gh_stats/issues-{rand}.csv", index=False) + pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/users-issues-{rand}.csv", index=False) + +###################################################### +# GET DATA FOR PRS +###################################################### +if EXECUTE_PRS: + # Initialize lists (we will store output data here) + prs = [] + comments = [] + users = {} + + # Get PRs + prs_raw = repo.get_pulls(state="all") + total_prs = prs_raw.totalCount # Total number of PRs for progress tracking + + # Retrieve relevant data (several API calls) + for i, pr in enumerate(prs_raw): + pr_data, issue_comments, users = process_issue(pr, users) + prs.append(pr_data) + comments.extend(issue_comments) + print(f"Progress: {i}/{total_prs} PRs processed") + + # Export + rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) + pd.DataFrame(comments).to_csv(f"gh_stats/comments-prs-{rand}.csv", index=False) + pd.DataFrame(prs).to_csv(f"gh_stats/prs-{rand}.csv", index=False) + pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/users-prs-{rand}.csv", index=False) + 
+###################################################### +# GET DATA FOR COMMITS +###################################################### +if EXECUTE_COMMIT: + # Initialize lists (we will store output data here) + commits = [] + users = {} + + # Get commits + commits_raw = repo.get_commits() + total_commits = commits_raw.totalCount # Total number of commits for progress tracking + + # Retrieve relevant data (several API calls) + for i, c in enumerate(commits_raw): + if i % 10 == 0: + print(f"Progress: {i}/{total_commits} commits processed") + user = c.author + commit_raw = { + "sha": c.sha, + "date": c.commit.author.date.strftime("%Y-%m-%d %H:%M:%S"), + "files_changed": len(c.files), + "lines_changed": c.stats.total, + "lines_deleted": c.stats.deletions, + "lines_added": c.stats.additions, + "user_id": user.id, + } + commits.append(commit_raw) + # Add user + if user.id not in users: + users[user.id] = { + "user_login": user.login, + "user_name": user.name, + "user_location": user.location, + } + + if (i != 0) & (i % 50 == 0): + # Export + print(f"Exporting {i}...") + rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) + pd.DataFrame(commits).to_csv(f"gh_stats/commits/{i}-commits-{rand}.csv", index=False) + pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/{i}-users-commits-{rand}.csv", index=False) + + # Export + rand = str(datetime.now().strftime("%Y%m%d%H%M%S")) + pd.DataFrame(commits).to_csv(f"gh_stats/commits/total-commits-{rand}.csv", index=False) + pd.DataFrame(users).T.reset_index().to_csv(f"gh_stats/commits/total-users-commits-{rand}.csv", index=False) diff --git a/snapshots/covid/2024-11-05/github_stats.py b/snapshots/covid/2024-11-05/github_stats.py new file mode 100644 index 00000000000..d12eba07cac --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats.py @@ -0,0 +1,83 @@ +"""Script to create a snapshot of dataset. + +The data for these snapshots has been manually obtained via the GitHub API. 
We have obtained data on + +- Issues: Comments, users that participated, etc. +- Pull Requests: Comments, users that participated, etc. +- Commits: Users that participated, etc. + +If you want to retrieve this data again, please look at the script `get_stats.py` in the same folder. You can simply execute it. To run different parts of the script please use the variables at the top of the script EXECUTE_ISSUES, EXECUTE_PRS, EXECUTE_COMMIT. + + python snapshots/covid/2024-11-05/github_stats.py \ + --issues gh_stats/issues-20241104000000.csv \ + --issues-comments gh_stats/comments-issues-20241104000000.csv \ + --issues-users gh_stats/users-issues-20241104000000.csv \ + --pr gh_stats/prs-20241105104652.csv \ + --pr-comments gh_stats/comments-prs-20241105104652.csv \ + --pr-users gh_stats/users-prs-20241105104652.csv \ + --commits gh_stats/commits/8800-commits-20241105165504.csv \ + --commits-users gh_stats/commits/8800-users-commits-20241105165504.csv + +""" + +from pathlib import Path +from typing import Optional + +import click +from structlog import get_logger + +from etl.snapshot import Snapshot + +log = get_logger() + + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--issues", type=str, help="File with data on issues.") +@click.option("--issues-comments", type=str, help="File with data on issues comments.") +@click.option("--issues-users", type=str, help="File with data on users that commented in issues.") +@click.option("--pr", type=str, help="File with data on PRs.") +@click.option("--pr-comments", type=str, help="File with data on PR comments.") +@click.option("--pr-users", type=str, help="File with data on users that commented in PRs.") +@click.option("--commits", type=str, help="File with data on commits.") +@click.option("--commits-users", type=str, help="File with data on commit users.") +def main( + upload: bool, + issues: Optional[str] = None, + issues_comments: Optional[str] = None, + issues_users: Optional[str] = None, + pr: Optional[str] = None, + pr_comments: Optional[str] = None, + pr_users: Optional[str] = None, + commits: Optional[str] = None, + commits_users: Optional[str] = None, +) -> None: + snapshot_paths = [ + (issues, "github_stats_issues.csv"), + (issues_comments, "github_stats_issues_comments.csv"), + (issues_users, "github_stats_issues_users.csv"), + (pr, "github_stats_pr.csv"), + (pr_comments, "github_stats_pr_comments.csv"), + (pr_users, "github_stats_pr_users.csv"), + (commits, "github_stats_commits.csv"), + (commits_users, "github_stats_commits_users.csv"), + ] + + for paths in snapshot_paths: + if paths[0] is not None: + log.info(f"Importing {paths[1]}.") + # Create a new snapshot. + snap = Snapshot(f"covid/{SNAPSHOT_VERSION}/{paths[1]}") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=paths[0], upload=upload) + else: + log.warning(f"Skipping import for {paths[1]}.") + + +if __name__ == "__main__": + main() diff --git a/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc b/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc new file mode 100644 index 00000000000..303806e4300 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_commits.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Commits" + description_snapshot: |- + This snapshot contains the list of commits to the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 45236ca93183af7d057bb47afe37d715 + size: 725503 + path: github_stats_commits.csv diff --git a/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc new file mode 100644 index 00000000000..d9020f310d7 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_commits_users.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Users from Commits" + description_snapshot: |- + This snapshot contains the list of users that submitted commits to the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: ada1edd0b7fb873af437894bd7d13779 + size: 301 + path: github_stats_commits_users.csv diff --git a/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc b/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc new file mode 100644 index 00000000000..d89eec2ce40 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_issues.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Issues" + description_snapshot: |- + This snapshot contains the list of created issues on the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 8d8a60f7da5dfadfc22e9f76e79ef29f + size: 109219 + path: github_stats_issues.csv diff --git a/snapshots/covid/2024-11-05/github_stats_issues_comments.csv.dvc b/snapshots/covid/2024-11-05/github_stats_issues_comments.csv.dvc new file mode 100644 index 00000000000..3842e7bf23d --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_issues_comments.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Comments on Issues" + description_snapshot: |- + This snapshot contains the list of comments in issues on the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 08b2b7c51f7ac9147ccf2d39e7460d99 + size: 284976 + path: github_stats_issues_comments.csv diff --git a/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc new file mode 100644 index 00000000000..79055164066 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_issues_users.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Users from Issues" + description_snapshot: |- + This snapshot contains the list of users that commented on issues in the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 305bd6080fdf3a0ea6ac89d1e4fdf681 + size: 14719 + path: github_stats_issues_users.csv diff --git a/snapshots/covid/2024-11-05/github_stats_pr.csv.dvc b/snapshots/covid/2024-11-05/github_stats_pr.csv.dvc new file mode 100644 index 00000000000..13eeb6a8770 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_pr.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Pull Requests" + description_snapshot: |- + This snapshot contains the list of created pull requests on the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 150fe7287532002345e75aec42a17e9c + size: 76283 + path: github_stats_pr.csv diff --git a/snapshots/covid/2024-11-05/github_stats_pr_comments.csv.dvc b/snapshots/covid/2024-11-05/github_stats_pr_comments.csv.dvc new file mode 100644 index 00000000000..61428fd6b98 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_pr_comments.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Comments on Pull Requests" + description_snapshot: |- + This snapshot contains the list of comments in pull requests on the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: c835ace4cc061aad7ddbb9c756991fe7 + size: 8126 + path: github_stats_pr_comments.csv diff --git a/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc b/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc new file mode 100644 index 00000000000..1d1da07e658 --- /dev/null +++ b/snapshots/covid/2024-11-05/github_stats_pr_users.csv.dvc @@ -0,0 +1,34 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: GitHub stats on owid/covid-19-data repository + description: |- + During the global COVID-19 pandemic, Our World in Data had a key role in collecting, disseminating, and publishing various indicators. These included figures on vaccinations, testing, confirmed deaths and cases, and more. + + This work was done in a GitHub repository, in the public eye. Our World in Data received numerous contributions from other actors who voluntarily helped collect all the data. Without their contribution, creating the dataset wouldn't have been possible. + title_snapshot: "GitHub stats on owid/covid-19-data repository: Users from Pull Requests" + description_snapshot: |- + This snapshot contains the list of users that commented on pull requests in the owid/covid-19-data GitHub repository. + + date_published: 2024-11-05 + + # Citation + producer: Our World in Data + citation_full: |- + Our World in Data. (2024). COVID-19 Data. GitHub repository. https://github.com/owid/covid-19-data. 
+ + # Files + url_main: https://github.com/owid/covid-19-data + date_accessed: 2024-11-05 + + # License + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ + +outs: + - md5: 6592cf7306251d5a094d478ab7df58b6 + size: 803 + path: github_stats_pr_users.csv From 1f5eaa51a9366a4d00226237d81faf4a762fb2c9 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 11 Dec 2024 18:33:01 +0100 Subject: [PATCH 3/6] clean --- etl/steps/export/multidim/covid/latest/covid.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl/steps/export/multidim/covid/latest/covid.py b/etl/steps/export/multidim/covid/latest/covid.py index 694de5cd58d..52a339c82c8 100644 --- a/etl/steps/export/multidim/covid/latest/covid.py +++ b/etl/steps/export/multidim/covid/latest/covid.py @@ -35,8 +35,6 @@ def run(dest_dir: str) -> None: config["views"] += multidim.expand_views(config, {"place": "*"}, table, engine) # type: ignore multidim.upsert_multidim_data_page(slug, config, engine) - print(1) - def fname_to_slug(fname: str) -> str: return f"mdd-{fname.replace('.yml', '').replace('.', '-').replace('_', '-')}" From 42cead464347f1ab06401acbeff4f1cbe75668ed Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 12 Dec 2024 12:19:38 +0100 Subject: [PATCH 4/6] add schema --- etl/steps/export/multidim/covid/latest/covid.cases.yml | 1 + etl/steps/export/multidim/covid/latest/covid.cases_tests.yml | 2 ++ etl/steps/export/multidim/covid/latest/covid.covax.yml | 4 ++++ etl/steps/export/multidim/covid/latest/covid.deaths.yml | 1 + etl/steps/export/multidim/covid/latest/covid.hospital.yml | 2 +- etl/steps/export/multidim/covid/latest/covid.mobility.yml | 1 + etl/steps/export/multidim/covid/latest/covid.models.yml | 4 ++++ .../export/multidim/covid/latest/covid.vax_breakdowns.yml | 3 ++- etl/steps/export/multidim/covid/latest/covid.xm_models.yml | 5 ++++- 9 files changed, 20 insertions(+), 3 deletions(-) diff --git a/etl/steps/export/multidim/covid/latest/covid.cases.yml 
b/etl/steps/export/multidim/covid/latest/covid.cases.yml index c939236f9d4..2f3c4fe5ee6 100644 --- a/etl/steps/export/multidim/covid/latest/covid.cases.yml +++ b/etl/steps/export/multidim/covid/latest/covid.cases.yml @@ -7,6 +7,7 @@ definitions: entity: true time: true changeInPrefix: true + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: title: COVID-19 confirmed cases diff --git a/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml b/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml index 9ca21fa345e..926657085bc 100644 --- a/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml +++ b/etl/steps/export/multidim/covid/latest/covid.cases_tests.yml @@ -24,6 +24,7 @@ views: x: grapher/covid/latest/cases_deaths/cases_deaths#new_cases_7_day_avg_right config: chartTypes: ["ScatterPlot"] + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json - dimensions: normalize: per_capita indicators: @@ -31,6 +32,7 @@ views: x: grapher/covid/latest/cases_deaths/cases_deaths#new_cases_per_million_7_day_avg_right color: 123 config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json chartTypes: ["ScatterPlot"] map: colorScale: diff --git a/etl/steps/export/multidim/covid/latest/covid.covax.yml b/etl/steps/export/multidim/covid/latest/covid.covax.yml index c6fcfaa58c0..c8690e62d11 100644 --- a/etl/steps/export/multidim/covid/latest/covid.covax.yml +++ b/etl/steps/export/multidim/covid/latest/covid.covax.yml @@ -70,6 +70,7 @@ views: chartTypes: ["StackedDiscreteBar"] sortBy: column sortColumnSlug: "{definitions.table}#delivered" + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json dimensions: - property: y variableId: 988127 @@ -101,6 +102,7 @@ views: note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. Gross domestic product is expressed in U.S. 
Dollars; it is sourced from the World Bank and OECD. originUrl: ourworldindata.org/coronavirus chartTypes: ["StackedDiscreteBar"] + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json - dimensions: normalize: per_dose @@ -116,6 +118,7 @@ views: note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. Gross domestic product is expressed in U.S. Dollars; it is sourced from the World Bank and OECD. originUrl: ourworldindata.org/coronavirus chartTypes: ["StackedDiscreteBar"] + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json - dimensions: normalize: per_gdp @@ -131,3 +134,4 @@ views: note: COVAX is a worldwide initiative aimed at equitable access to COVID-19 vaccines. It is directed by Gavi, CEPI, and the WHO. Gross domestic product is expressed in U.S. Dollars; it is sourced from the World Bank and OECD. originUrl: ourworldindata.org/coronavirus chartTypes: ["StackedDiscreteBar"] + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json diff --git a/etl/steps/export/multidim/covid/latest/covid.deaths.yml b/etl/steps/export/multidim/covid/latest/covid.deaths.yml index 42d4e5f3dbc..f57ea1b2b6e 100644 --- a/etl/steps/export/multidim/covid/latest/covid.deaths.yml +++ b/etl/steps/export/multidim/covid/latest/covid.deaths.yml @@ -1,6 +1,7 @@ definitions: table: grapher/covid/latest/cases_deaths/cases_deaths config: &config + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json tab: map originUrl: ourworldindata.org/coronavirus hideAnnotationFieldsInTitle: diff --git a/etl/steps/export/multidim/covid/latest/covid.hospital.yml b/etl/steps/export/multidim/covid/latest/covid.hospital.yml index 6cf63899fa0..58ec5ec722b 100644 --- a/etl/steps/export/multidim/covid/latest/covid.hospital.yml +++ b/etl/steps/export/multidim/covid/latest/covid.hospital.yml @@ -1,13 +1,13 @@ definitions: table: 
grapher/covid/latest/hospital/hospital config: &config + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json hideAnnotationFieldsInTitle: entity: true time: true changeInPrefix: true originUrl: ourworldindata.org/coronavirus - title: title: COVID-19 hospitalisations titleVariant: by type diff --git a/etl/steps/export/multidim/covid/latest/covid.mobility.yml b/etl/steps/export/multidim/covid/latest/covid.mobility.yml index 0da7e1301f7..9da7e7b7037 100644 --- a/etl/steps/export/multidim/covid/latest/covid.mobility.yml +++ b/etl/steps/export/multidim/covid/latest/covid.mobility.yml @@ -47,6 +47,7 @@ views: - "{definitions.table}#trend__place_workplaces" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "How did the number of visitors change since the beginning of the pandemic?" subtitle: This data shows how community movement in specific locations has changed relative to the period before the pandemic. note: It's not recommended to compare levels across countries; local differences in categories could be misleading. diff --git a/etl/steps/export/multidim/covid/latest/covid.models.yml b/etl/steps/export/multidim/covid/latest/covid.models.yml index ce5077614f9..9f384a805eb 100644 --- a/etl/steps/export/multidim/covid/latest/covid.models.yml +++ b/etl/steps/export/multidim/covid/latest/covid.models.yml @@ -38,6 +38,7 @@ views: - "grapher/covid/latest/cases_deaths/cases_deaths#new_cases_7_day_avg_right" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Daily new estimated COVID-19 infections from the ICL model" subtitle: Estimates of the true number of infections. The "upper" and "lower" lines show the bounds of a 95% uncertainty interval. For comparison, confirmed cases are infections that have been confirmed with a test. note: This chart shows the model estimates dated 25 December 2022. 
@@ -56,6 +57,7 @@ views: - "grapher/covid/latest/cases_deaths/cases_deaths#new_cases_7_day_avg_right" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Daily new estimated COVID-19 infections from the IHME model" subtitle: Estimates of the true number of infections. The "upper" and "lower" lines show the bounds of a 95% uncertainty interval. For comparison, confirmed cases are infections that have been confirmed with a test. note: This chart shows the model estimates dated 16 December 2022. @@ -74,6 +76,7 @@ views: - "grapher/covid/latest/cases_deaths/cases_deaths#new_cases_7_day_avg_right" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Daily new estimated COVID-19 infections from the LSHTM model" subtitle: Estimates of the true number of infections. The "upper" and "lower" lines show the bounds of a 95% uncertainty interval. For comparison, confirmed cases are infections that have been confirmed with a test. note: This chart shows the model estimates dated 25 August 2020. @@ -92,6 +95,7 @@ views: - "grapher/covid/latest/cases_deaths/cases_deaths#new_cases_7_day_avg_right" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Daily new estimated COVID-19 infections from the YYG model" subtitle: Estimates of the true number of infections. The "upper" and "lower" lines show the bounds of a 95% uncertainty interval. For comparison, confirmed cases are infections that have been confirmed with a test. note: This chart shows the model estimates dated 4 October 2020. YYG announced that this is the final model update. 
diff --git a/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml b/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml index 6c7458f0ea2..6f02057ee63 100644 --- a/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml +++ b/etl/steps/export/multidim/covid/latest/covid.vax_breakdowns.yml @@ -23,13 +23,13 @@ dimensions: name: Manufacturer description: null - views: - dimensions: breakdown: income_group indicators: y: "grapher/covid/latest/vaccinations_global/vaccinations_global#total_vaccinations_per_hundred" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json addCountryMode: "disabled" selectedEntityNames: - High-income countries @@ -55,6 +55,7 @@ views: - "{definitions.table}#total_vaccinations__vaccine_skycovione" - "{definitions.table}#total_vaccinations__vaccine_valneva" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json chartTypes: ["StackedArea"] selectedEntityNames: - European Union (27) diff --git a/etl/steps/export/multidim/covid/latest/covid.xm_models.yml b/etl/steps/export/multidim/covid/latest/covid.xm_models.yml index 22963d3d376..3f54a86c669 100644 --- a/etl/steps/export/multidim/covid/latest/covid.xm_models.yml +++ b/etl/steps/export/multidim/covid/latest/covid.xm_models.yml @@ -43,13 +43,13 @@ views: - "{definitions.indicator_confirmed}" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Estimated cumulative excess deaths during COVID-19" subtitle: For countries that have not reported all-cause mortality data for a given week, an estimate is shown, with uncertainty interval. If reported data is available, that value only is shown. For comparison, cumulative confirmed COVID-19 deaths are shown. 
originUrl: ourworldindata.org/coronavirus hideAnnotationFieldsInTitle: time: true - # The Economist - dimensions: model: econ @@ -61,6 +61,7 @@ views: - "{definitions.indicator_confirmed}" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Estimated cumulative excess deaths during COVID-19" subtitle: For countries that have not reported all-cause mortality data for a given week, an estimate is shown, with uncertainty interval. If reported data is available, that value only is shown. For comparison, cumulative confirmed COVID-19 deaths are shown. originUrl: ourworldindata.org/coronavirus @@ -78,6 +79,7 @@ views: - "{definitions.indicator_confirmed}" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Estimated cumulative excess deaths during COVID-19" subtitle: For countries that have not reported all-cause mortality data for a given week, an estimate is shown, with uncertainty interval. If reported data is available, that value only is shown. For comparison, cumulative confirmed COVID-19 deaths are shown. originUrl: ourworldindata.org/coronavirus @@ -95,6 +97,7 @@ views: - "{definitions.indicator_confirmed}" config: + $schema: https://files.ourworldindata.org/schemas/grapher-schema.006.json title: "Estimated cumulative excess deaths during COVID-19" subtitle: For countries that have not reported all-cause mortality data for a given week, an estimate is shown, with uncertainty interval. If reported data is available, that value only is shown. For comparison, cumulative confirmed COVID-19 deaths are shown. 
originUrl: ourworldindata.org/coronavirus From 07979df50832876f62c8a71bb750bd31745039ed Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 12 Dec 2024 12:19:45 +0100 Subject: [PATCH 5/6] wip --- etl/steps/data/garden/covid/latest/cases_deaths.meta.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml b/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml index d1264ec3394..48a65ecfc7d 100644 --- a/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml +++ b/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml @@ -208,9 +208,10 @@ tables: colorScale: customNumericColorsActive: true binningStrategy: manual - customNumericValues: [-200, -100, -50, -20, 0, 20, 50, 100, 200, 500] + customNumericValues: [-200, -100, -50, -20, 0, 20, 50, 100, 200, 1] colorSchemeInvert: true baseColorScheme: RdBu + customNumericMinValue: -1 display: numDecimalPlaces: 2 tolerance: 30 @@ -269,9 +270,10 @@ tables: colorScale: customNumericColorsActive: true binningStrategy: manual - customNumericValues: [-200, -100, -40, 0, 40, 100, 200, 400, 1000] + customNumericValues: [-500, -100, -50, 0, 50, 100, 500, 1] colorSchemeInvert: true baseColorScheme: RdBu + customNumericMinValue: -1 # Deaths ################ From f6d05cb74ef9f74cd464a68d1f9fa183a4a34b44 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 12 Dec 2024 15:07:30 +0100 Subject: [PATCH 6/6] improve description --- etl/steps/data/garden/covid/latest/cases_deaths.meta.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml b/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml index 48a65ecfc7d..0c8677a074a 100644 --- a/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml +++ b/etl/steps/data/garden/covid/latest/cases_deaths.meta.yml @@ -202,7 +202,7 @@ tables: presentation: grapher_config: title: Week by week change of confirmed COVID-19 cases - subtitle: The 
weekly growth rate on any given date measures the percentage change in number of confirmed cases over the last seven days relative to the number in the previous seven days. + subtitle: The weekly growth rate measures the percentage change in confirmed cases over the past seven days compared to the seven days before. hasMapTab: true map: colorScale: @@ -264,7 +264,7 @@ tables: presentation: grapher_config: title: Biweekly change of confirmed COVID-19 cases - subtitle: The biweekly growth rate on any given date measures the percentage change in the number of new confirmed cases over the last 14 days relative to the number in the previous 14 days. + subtitle: The biweekly growth rate measures the percentage change in confirmed cases over the past 14 days compared to the 14 days before. hasMapTab: true map: colorScale: