From 50e083f1d0732481bdd118470b81bff16257b5c5 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Thu, 18 Apr 2024 11:44:50 +0200 Subject: [PATCH 1/3] :sparkles: add --errors and --include/--exclude to chart-sync (#2540) * :sparkles: add --include and --exclude to chart-sync * :sparkles: add --exclude and --errors options to chart-sync * add --include --- apps/staging_sync/cli.py | 47 +++++++++++++++++++++++++++++++++++----- etl/grapher_model.py | 24 +++++++++++++++++++- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/apps/staging_sync/cli.py b/apps/staging_sync/cli.py index b10e0968b1a..f6173ed8481 100644 --- a/apps/staging_sync/cli.py +++ b/apps/staging_sync/cli.py @@ -2,7 +2,7 @@ import datetime as dt import re from pathlib import Path -from typing import Any, Dict, Optional, Set +from typing import Any, Dict, Literal, Optional, Set import click import pandas as pd @@ -53,6 +53,24 @@ help="""Staging server UTC creation date. It is used to warn about charts that have been updated in production. Default is branch creation date.""", ) +@click.option( + "--include", + default=None, + type=str, + help="""Include only charts with variables whose catalogPath matches the provided string.""", +) +@click.option( + "--exclude", + default=None, + type=str, + help="""Exclude charts with variables whose catalogPath matches the provided string.""", +) +@click.option( + "--errors", + default="raise", + type=click.Choice(["raise", "warn"]), + help="""How to handle errors when syncing charts. 'warn' will skip the chart and continue.""", +) @click.option( "--dry-run/--no-dry-run", default=False, @@ -66,6 +84,9 @@ def cli( publish: bool, approve_revisions: bool, staging_created_at: Optional[dt.datetime], + include: Optional[str], + exclude: Optional[str], + errors: Literal["warn", "raise"], dry_run: bool, ) -> None: """Sync Grapher charts and revisions from an environment to the main environment. 
@@ -145,12 +166,28 @@ def cli( _remove_nonexisting_column_slug(source_chart, source_session) try: - target_chart = source_chart.migrate_to_db(source_session, target_session) + target_chart = source_chart.migrate_to_db( + source_session, target_session, include=include, exclude=exclude + ) except ValueError as e: - if "variables.catalogPath not found in target" in str(e): - raise ValueError("ETL deploy hasn't finished yet. Check the repository.") from e + if errors == "warn": + log.warning("staging_sync.error", chart_id=chart_id, error=str(e)) + continue else: - raise e + if "variables.catalogPath not found in target" in str(e): + raise ValueError("ETL deploy hasn't finished yet. Check the repository.") from e + else: + raise e + + # exclude charts with variables whose catalogPath matches the provided string + if target_chart is None: + log.info( + "staging_sync.skip", + slug=source_chart.config["slug"], + reason="filtered by --include/--exclude", + chart_id=chart_id, + ) + continue # try getting chart with the same slug try: diff --git a/etl/grapher_model.py b/etl/grapher_model.py index 1f777c807bd..bf25fbd73a9 100644 --- a/etl/grapher_model.py +++ b/etl/grapher_model.py @@ -5,6 +5,7 @@ It has been slightly modified since then. """ import json +import re from datetime import date, datetime from pathlib import Path from typing import Annotated, Any, Dict, List, Literal, Optional, TypedDict, Union, get_args @@ -329,7 +330,13 @@ def load_chart_variables(self, session: Session) -> Dict[int, "Variable"]: return variables - def migrate_to_db(self, source_session: Session, target_session: Session) -> "Chart": + def migrate_to_db( + self, + source_session: Session, + target_session: Session, + include: Optional[str] = None, + exclude: Optional[str] = None, + ) -> Optional["Chart"]: """Remap variable ids from source to target session. Variable in source is uniquely identified by its catalogPath if available, or by name and datasetId otherwise. 
It is looked up by this identifier in the target session to get the new variable id. @@ -340,6 +347,21 @@ def migrate_to_db(self, source_session: Session, target_session: Session) -> "Ch assert self.id, "Chart must come from a database" source_variables = self.load_chart_variables(source_session) + # if chart contains a variable that is excluded, skip the whole chart + if exclude: + for source_var in source_variables.values(): + if source_var.catalogPath and re.search(exclude, source_var.catalogPath): + return None + + # a chart must contain at least one variable matching include, otherwise skip it + if include: + matching = False + for source_var in source_variables.values(): + if source_var.catalogPath and re.search(include, source_var.catalogPath): + matching = True + if not matching: + return None + remap_ids = {} for source_var_id, source_var in source_variables.items(): if source_var.catalogPath: From 59ba9d7b4b02ae70ef5ae1f8f7d75c612cba46fb Mon Sep 17 00:00:00 2001 From: Fiona Spooner Date: Thu, 18 Apr 2024 12:41:20 +0100 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=93=8A=20WHO/Global=20Polio=20Eradica?= =?UTF-8?q?tion=20Initiative=20polio=20datasets=20(#2502)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * adding snapshot - meadow - garden * adding garden and grapher steps * fudging the origins issue * adding some comments * adding historical polio cases from who web archive * garden step historical cases * adding historical data and using correction factor to estimate total cases * adding metadata * adding regions aggs * wip * adding gpei fasttrack and metadat * shuffling things about * adding certification snapshot * adding meadow step for certification * adding polio free countries from gpei * adding polio free countries * removing garden step polion status * adding population dependencies * adding per million and harmonizing historical data too * adding year of last wild polio case * wip * wip * updates * adding 
who regional certification * adding grapher step and ironing out some wrinkles * fixing some warnings * getting it going on grapher * changing dataset titles * add low risk * sorting out the surveillance variable * fixing missing arguments * not using ordinal for now * adding pablo arriagada's suggestions * adding pablo rosado's suggestions * fixing error --- dag/health.yml | 36 ++ .../polio_free_countries.countries.json | 217 ++++++++++++ .../2024-04-12/polio_free_countries.meta.yml | 30 ++ .../health/2024-04-12/polio_free_countries.py | 116 +++++++ .../who/2024-04-08/polio.countries.json | 213 ++++++++++++ .../2024-04-08/polio.excluded_countries.json | 7 + .../data/garden/who/2024-04-08/polio.meta.yml | 157 +++++++++ etl/steps/data/garden/who/2024-04-08/polio.py | 325 ++++++++++++++++++ .../polio_historical.countries.json | 196 +++++++++++ .../who/2024-04-09/polio_historical.meta.yml | 20 ++ .../garden/who/2024-04-09/polio_historical.py | 36 ++ .../health/2024-04-12/polio_free_countries.py | 21 ++ .../data/grapher/who/2024-04-08/polio.py | 32 ++ .../health/2024-04-12/polio_free_countries.py | 22 ++ .../meadow/health/2024-04-12/polio_status.py | 27 ++ .../data/meadow/who/2024-04-08/polio_afp.py | 32 ++ .../meadow/who/2024-04-09/polio_historical.py | 38 ++ .../2024-04-12/polio_free_countries.csv.dvc | 27 ++ .../health/2024-04-12/polio_free_countries.py | 32 ++ .../health/2024-04-12/polio_status.csv.dvc | 26 ++ snapshots/health/2024-04-12/polio_status.py | 56 +++ snapshots/who/2024-04-08/polio_afp.csv.dvc | 26 ++ snapshots/who/2024-04-08/polio_afp.py | 37 ++ snapshots/who/2024-04-09/polio_historical.py | 31 ++ .../who/2024-04-09/polio_historical.xls.dvc | 28 ++ 25 files changed, 1788 insertions(+) create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.py create 
mode 100644 etl/steps/data/garden/who/2024-04-08/polio.countries.json create mode 100644 etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json create mode 100644 etl/steps/data/garden/who/2024-04-08/polio.meta.yml create mode 100644 etl/steps/data/garden/who/2024-04-08/polio.py create mode 100644 etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json create mode 100644 etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml create mode 100644 etl/steps/data/garden/who/2024-04-09/polio_historical.py create mode 100644 etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py create mode 100644 etl/steps/data/grapher/who/2024-04-08/polio.py create mode 100644 etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py create mode 100644 etl/steps/data/meadow/health/2024-04-12/polio_status.py create mode 100644 etl/steps/data/meadow/who/2024-04-08/polio_afp.py create mode 100644 etl/steps/data/meadow/who/2024-04-09/polio_historical.py create mode 100644 snapshots/health/2024-04-12/polio_free_countries.csv.dvc create mode 100644 snapshots/health/2024-04-12/polio_free_countries.py create mode 100644 snapshots/health/2024-04-12/polio_status.csv.dvc create mode 100644 snapshots/health/2024-04-12/polio_status.py create mode 100644 snapshots/who/2024-04-08/polio_afp.csv.dvc create mode 100644 snapshots/who/2024-04-08/polio_afp.py create mode 100644 snapshots/who/2024-04-09/polio_historical.py create mode 100644 snapshots/who/2024-04-09/polio_historical.xls.dvc diff --git a/dag/health.yml b/dag/health.yml index 7ade31c5ed1..320697744bb 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -512,3 +512,39 @@ steps: - data://garden/demography/2023-03-31/population data://grapher/health/2024-04-02/organ_donation_and_transplantation: - data://garden/health/2024-04-02/organ_donation_and_transplantation + + # Polio AFP surveillance + data://meadow/who/2024-04-08/polio_afp: + - snapshot://who/2024-04-08/polio_afp.csv + + # Polio historical 
data + data://meadow/who/2024-04-09/polio_historical: + - snapshot://who/2024-04-09/polio_historical.xls + data://garden/who/2024-04-09/polio_historical: + - data://meadow/who/2024-04-09/polio_historical + + # Combining polio datasets + data://garden/who/2024-04-08/polio: + - data://meadow/who/2024-04-08/polio_afp + - data://meadow/who/2024-04-09/polio_historical + - data://garden/wb/2023-04-30/income_groups + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - snapshot://fasttrack/latest/gpei.csv + - snapshot://health/2024-04-12/polio_status.csv + data://grapher/who/2024-04-08/polio: + - data://garden/who/2024-04-08/polio + + # Polio certification status + data://meadow/health/2024-04-12/polio_status: + - snapshot://health/2024-04-12/polio_status.csv + + # Polio free countries + data://meadow/health/2024-04-12/polio_free_countries: + - snapshot://health/2024-04-12/polio_free_countries.csv + data://garden/health/2024-04-12/polio_free_countries: + - data://meadow/health/2024-04-12/polio_status + - data://meadow/health/2024-04-12/polio_free_countries + - data://garden/regions/2023-01-01/regions + data://grapher/health/2024-04-12/polio_free_countries: + - data://garden/health/2024-04-12/polio_free_countries diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json new file mode 100644 index 00000000000..8af883c624a --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json @@ -0,0 +1,217 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + 
"Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guam": "Guam", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": 
"Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": 
"Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Libyan Arab Jamahiriya": "Libya", + "American Samoa": "American Samoa", + "Anguilla": "Anguilla", + "Bermuda": "Bermuda", + "Bolivia": "Bolivia", + "British Virgin Islands": "British Virgin Islands", + "Cayman Islands": "Cayman Islands", + "East Timor": "East Timor", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Guadeloupe": "Guadeloupe", + "Iran": "Iran", + "Martinique": "Martinique", + "Montserrat": "Montserrat", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "Puerto Rico": "Puerto Rico", + "Reunion": "Reunion", + "Saint Helena": "Saint Helena", + "Swaziland": "Eswatini", + "Tokelau": "Tokelau", + "Turkey": "Turkey", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Venezuela": "Venezuela", + "Wallis and Futuna": "Wallis and Futuna", + "Barabados": "Barbados", + "Cote d\u2019Ivoire": "Cote d'Ivoire", + "Democratic People\u2019s Rep. of Korea": "North Korea", + "Federated States of Micronesia": "Micronesia (country)", + "Former Yugoslav Rep. 
of Macedonia": "North Macedonia", + "Hong Kong, SAR": "Hong Kong", + "Lao People\u2019s Democratic Republic": "Laos", + "Macao, SAR": "Macao", + "Mariana Islands": "Northern Mariana Islands", + "Palestine N.A.": "Palestine", + "US Virgin Islands": "United States Virgin Islands", + "Democratic Republic of the Congo": "Democratic Republic of Congo" +} \ No newline at end of file diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml new file mode 100644 index 00000000000..18c6ab5715e --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml @@ -0,0 +1,30 @@ +definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + processing_level: minor + unit: "" + short_unit: "" + display: + numDecimalPlaces: 0 + +dataset: + update_period_days: 365 + +tables: + polio_free_countries: + variables: + latest_year_wild_polio_case: + title: Latest year of wild polio case + description_short: The most recent year in which a case of wild poliovirus was detected in a country. + unit: "" + display: + numDecimalPlaces: 0 + status: + title: Polio-free status + description_short: The status of a country in terms of polio eradication. 
+ unit: "" + display: + numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..f731727875c --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,116 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from itertools import product + +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr + +from etl.data_helpers.geo import harmonize_countries, list_members_of_region +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +LATEST_YEAR = 2023 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("polio_free_countries") + tb = ds_meadow["polio_free_countries"].reset_index() + + ##### Temporary fix - we remove West Bank and Gaza as there is both data for West Bank and Gaza _and_ Palestine N.A (national authority). + ##### I'm not sure how we should treat these but for now I will just stick with the entity that has the latest value, so Palestine N.A. 
+ + tb = tb[tb["country"] != "West Bank and Gaza"] + ##### There are also two values for Somalia, I will drop the least recent one + tb = tb[~((tb["country"] == "Somalia") & (tb["year"] == "2000"))] + + # Loading the polio status data for WHO regions + ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") + tb_region_status = ds_region_status["polio_status"].reset_index() + + # Loading in the regions table so we know which countries are in each WHO region + ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() + who_regions = tb_regions[(tb_regions["defined_by"] == "who") & (tb_regions["region_type"] == "aggregate")] + + tb = harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Assign polio free countries. + tb = define_polio_free(tb, latest_year=LATEST_YEAR) + + tb = add_polio_region_certification(tb, tb_region_status, who_regions, ds_regions) + # Set an index and sort. + tb = tb.format() + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() + + +def add_polio_region_certification( + tb: Table, tb_region_status: Table, who_regions: Table, ds_regions: Dataset +) -> Table: + # Append "(WHO)" suffix to the "who_region" to match the region names in the who_regions table + tb_region_status["who_region"] = tb_region_status["who_region"].astype(str) + " (WHO)" + + # Correct mapping of regions to status updates by ensuring 'region' matches the modified 'who_region' entries + for region in who_regions["name"]: + # Generate country list for the current region + country_list = list_members_of_region(region=region, ds_regions=ds_regions) + if not country_list: + raise ValueError(f"No countries found for region {region}") + + # Find the year of certification for the current region + year_certified = tb_region_status.loc[tb_region_status["who_region"] == region, "year_certified_polio_free"] + + # Check if there is a valid year of certification + if not year_certified.empty and year_certified.notna().all(): + year_certified_int = int(year_certified.iloc[0]) + # Set the status for all relevant countries and years + tb.loc[ + (tb["country"].isin(country_list)) & (tb["year"] >= year_certified_int), "status" + ] = "WHO Region certified polio-free" + + return tb + + +def define_polio_free(tb: Table, latest_year: int) -> Table: + """Define the polio free countries table.""" + tb = tb.copy() + # Clean the data + tb["year"] = tb["year"].astype(str) + + # Drop countries with missing values explicitly copying to avoid setting on a slice warning + tb = tb[tb["year"] != "data not available"] + + # Change 'pre 1985' to 1984 and 'ongoing' to LATEST_YEAR + tb.loc[tb["year"] == "pre 1985", "year"] = "1984" + tb.loc[tb["year"] == "ongoing", "year"] = str(latest_year) + + tb["year"] = tb["year"].astype(int) + # Rename year to latest year + tb = tb.rename(columns={"year": "latest_year_wild_polio_case"}) + tb["year"] = latest_year +
# Create a product of all countries and all years from 1910 to LATEST_YEAR + tb_prod = Table(product(tb["country"].unique(), range(1910, latest_year + 1)), columns=["country", "year"]) + tb_prod = tb_prod.copy_metadata(from_table=tb) + + # Define polio status based on the year comparison + tb_prod["status"] = tb_prod.apply( + lambda row: "Endemic" + if row["year"] < tb[tb["country"] == row["country"]]["latest_year_wild_polio_case"].min() + else "Polio-free (not certified)", + axis=1, + ) + # Merge the two tables + tb = pr.merge(tb, tb_prod, on=["country", "year"], how="right") + # Issues with status not having origins or source, not sure this is the best way to solve + tb["status"] = tb["status"].copy_metadata(tb["latest_year_wild_polio_case"]) + return tb diff --git a/etl/steps/data/garden/who/2024-04-08/polio.countries.json b/etl/steps/data/garden/who/2024-04-08/polio.countries.json new file mode 100644 index 00000000000..1a59d51c260 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.countries.json @@ -0,0 +1,213 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bahamas": "Bahamas", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape 
Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": 
"Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Moldova": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Reunion": "Reunion", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Serbia and Montenegro": "Serbia and Montenegro", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + 
"Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Libyan Arab Jamahiriya": "Libya", + "The former Yugoslav Republic of Macedonia": "North Macedonia", + "Timor Leste": "East Timor" +} \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json b/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json new file mode 100644 index 00000000000..94f87976655 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json @@ -0,0 +1,7 @@ +[ + "CAREC", + "Carec", + "Pacific Island Countries", + "Pacific Island countries", + "Yugoslavia" +] \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml new file mode 100644 index 00000000000..9bab6dc40e2 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -0,0 +1,157 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases +# surveillance_sort: &surveillance_sort +# - Inadequate screening and testing +# - Inadequate screening +# - Inadequate testing +# - Adequate screening and testing +# - Low risk +# - nan + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + polio: + variables: + afp_cases: + title: Acute Flaccid Paralysis (AFP) cases + description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given year. + unit: cases + display: + numDecimalPlaces: 0 + afp_cases_per_million: + title: Acute Flaccid Paralysis (AFP) cases per million + description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + non_polio_afp_rate: + title: Non-polio acute flaccid paralysis (AFP) rate + description_short: The number of cases of acute flaccid paralysis (AFP) per 100,000 children aged 15 or under, not attributed to polio, that were detected and reported to the WHO. + description_key: + - Acute flaccid paralysis (AFP) surveillance is the gold standard for detecting cases of poliomyelitis. + - At least one case of non-polio AFP should be detected annually per 100,000 population aged less than 15 years. In endemic regions, to ensure even higher sensitivity, this rate should be two per 100,000. + unit: cases per 100,000 children + pct_adequate_stool_collection: + title: Adequate stool collection (%) + description_short: The share of acute flaccid paralysis (AFP) cases where stool samples were tested for poliovirus and reported to the WHO. + unit: "%" + short_unit: "%" + pending: + title: Pending + description_short: The number of acute flaccid paralysis (AFP) cases pending classification by the WHO.
+ unit: cases + wild_poliovirus_cases: + title: Wild Poliovirus cases + description_short: "The number of cases of wild poliovirus detected in a given year" + unit: cases + display: + numDecimalPlaces: 0 + wild_poliovirus_cases_per_million: + title: Wild Poliovirus cases per million + description_short: The number of cases of wild poliovirus detected in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv_cases: + title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases + description_short: The number of cases of circulating vaccine-derived poliovirus detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv_cases_per_million: + title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus detected in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + compatibles: + title: Compatibles + unit: "" + footnote: + title: Footnote + unit: "" + correction_factor: + title: Correction factor + description_short: Correction factor applied to the number of reported polio cases to account for underreporting. + unit: factor + polio_surveillance_status: + title: Polio surveillance status + description_short: "The status of polio surveillance in a given country" + description_key: + - A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under. + - A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%. 
+ - Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan.](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf) + unit: "" + #type: ordinal + #sort: *surveillance_sort + total_cases: + title: Total polio cases + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases. + unit: cases + display: + numDecimalPlaces: 0 + total_cases_per_million: + title: Total polio cases per million + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + estimated_cases: + title: Estimated polio cases + description_short: The total estimated number of polio cases. + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al. (2010). + unit: cases + display: + numDecimalPlaces: 0 + estimated_cases_per_million: + title: Estimated polio cases per million + description_short: The total estimated number of polio cases per million people. + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al. (2010). + unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv1: + title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases + description_short: The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv1_per_million: + title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year, per million people.
+ unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv2: + title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases + description_short: The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv2_per_million: + title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year, per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv3: + title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases + description_short: The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv3_per_million: + title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year, per million people. + unit: cases per million + display: + numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py new file mode 100644 index 00000000000..fb1b21979e8 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -0,0 +1,325 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr + +from etl.data_helpers.geo import add_regions_to_table, harmonize_countries, list_members_of_region +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Year to use for the screening and testing rates. +# Should be the most recent year of complete data. 
+SCREENING_YEAR = 2023 + +REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania", "World"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow acute flaccid paralysis dataset. + ds_meadow = paths.load_dataset("polio_afp") + # Load historical polio dataset + ds_historical = paths.load_dataset("polio_historical") + # Load population data to calculate cases per million population + ds_population = paths.load_dataset("population") + tb_population = ds_population["population"].reset_index() + # Load fasttrack Global Polio Eradication Initiative on circulating vaccine derived polio cases + snap_cvdpv = paths.load_snapshot("gpei.csv") + tb_cvdpv = snap_cvdpv.read() + # Dropping this as the total_cvdpv is also in the polio_afp table and has more historical data + tb_cvdpv = tb_cvdpv.drop(columns=["total_cvdpv"]) + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # Read table from meadow dataset. + tb = ds_meadow["polio_afp"].reset_index() + tb_hist = ds_historical["polio_historical"].reset_index() + tb_hist = tb_hist.rename(columns={"cases": "total_cases"}) + # Only need this for data prior to 2001 + tb_hist = tb_hist[tb_hist["year"] < 2001] + + # Remove data from before 2001. + tb = remove_pre_2001_data(tb) + # Remove values > 100% for "Adequate stool collection". + tb = clean_adequate_stool_collection(tb) + # Add total cases + tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] + # Need to deal with overlapping years + tb = pr.concat([tb_hist, tb], axis=0) + tb = harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + tb = tb.merge(tb_cvdpv, on=["country", "year"], how="left") + # Add region aggregates. 
+ tb_reg = add_regions_to_table( + tb[ + [ + "country", + "year", + "afp_cases", + "wild_poliovirus_cases", + "cvdpv_cases", + "total_cases", + "cvdpv1", + "cvdpv2", + "cvdpv3", + ] + ], + regions=REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + min_num_values_per_year=1, + ) + tb_reg = tb_reg[tb_reg["country"].isin(REGIONS)] + tb = pr.concat([tb, tb_reg], axis=0) + # Add correction factor to estimate polio cases based on reported cases. + tb = add_correction_factor(tb) + tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] + # Add polio surveillance status based on the screening and testing rates. + tb = add_screening_and_testing(tb, tb_regions, ds_regions) + tb = add_cases_per_million(tb, tb_population) + tb = tb.format(short_name="polio") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def add_cases_per_million(tb: Table, tb_population: Table) -> Table: + """ + Add cases per million population for each country, for the columns concerning each type of polio cases. 
+ """ + tb_population = tb_population[["country", "year", "population"]] + tb = tb.merge(tb_population, on=["country", "year"], how="left") + + cols_to_divide = [ + "afp_cases", + "wild_poliovirus_cases", + "cvdpv_cases", + "total_cases", + "estimated_cases", + "cvdpv1", + "cvdpv2", + "cvdpv3", + ] + for col in cols_to_divide: + tb[f"{col}_per_million"] = tb[col] / tb["population"] * 1_000_000 + + tb = tb.drop(columns=["population"]) + return tb + + +def list_of_who_countries(tb_regions: Table, ds_regions: Dataset) -> list: + """List of countries as defined by WHO.""" + who_countries = [] + who_regions = tb_regions[tb_regions["defined_by"] == "who"] + for region in who_regions["name"]: + country_list = list_members_of_region(region=region, ds_regions=ds_regions) + if not country_list: + raise ValueError(f"No countries found for region {region}") + who_countries.extend(country_list) + return who_countries + + +def identify_low_risk_countries(tb: Table, tb_regions: Table, ds_regions: Dataset) -> Table: + # Identify low-risk countries (where the surveillance status can be disregarded) + # High risk entities are those identified in the table on page 48 in this document: https://polioeradication.org/wp-content/uploads/2022/04/GPSAP-2022-2024-EN.pdf + higher_risk_entities = [ + "Chad", + "Democratic Republic of Congo", + "Ethiopia", + "Niger", + "Nigeria", + "Afghanistan", + "Pakistan", + "Somalia", + "Angola", + "Burkina Faso", + "Cameroon", + "Central African Republic", + "Guinea", + "Kenya", + "Mali", + "South Sudan", + "Yemen", + "Benin", + "Cote d'Ivoire", + "Equatorial Guinea", + "Guinea-Bissau", + "Madagascar", + "Mozambique", + "Togo", + "Iraq", + "Sudan", + "Syria", + "Myanmar", + "Papua New Guinea", + "Philippines", + "Burundi", + "Congo", + "Gabon", + "Gambia", + "Ghana", + "Liberia", + "Senegal", + "Sierra Leone", + "Uganda", + "Zambia", + "Djibouti", + "Egypt", + "Iran", + "Libya", + "Tajikistan", + "Ukraine", + "Indonesia", + "Nepal", + "Haiti", + "Laos", + 
"China", + "Eritrea", + "Malawi", + "Mauritania", + "Namibia", + "Rwanda", + "Tanzania", + "Zimbabwe", + "Lebanon", + "Bangladesh", + "India", + "East Timor", + "Bolivia", + "Cambodia", + "Malaysia", + ] + + difference = [item for item in higher_risk_entities if item not in tb["country"].unique()] + assert difference == [], f"Entities in the high-risk list that are not in the dataset: {difference}" + + # Define the condition for which countries are not in high-risk entities + not_high_risk = ~tb["country"].isin(higher_risk_entities) + + # Define the condition for screening year + is_screening_year = tb["year"] == SCREENING_YEAR + + # Combine conditions and update 'polio_surveillance_status' for matching rows + tb.loc[not_high_risk & is_screening_year, "polio_surveillance_status"] = "Low risk" + + return tb + + +def add_screening_and_testing(tb: Table, tb_regions: Dataset, ds_regions: Dataset) -> Table: + """ + Adds the polio surveillance status based on the screening and testing rates. + For use in this chart: https://ourworldindata.org/grapher/polio-screening-and-testing + + Parameters: + - tb: table containing polio surveillance data. + - year: Specific year to filter the data. If None, uses current year. + + Returns: + - Modified table with a new column for polio surveillance status. 
+ """ + # Ensuring we have all the countries in the WHO regions - even if there isn't other polio data for them + who_countries = list_of_who_countries(tb_regions, ds_regions) + who_tb = Table({"country": who_countries, "year": SCREENING_YEAR}).copy_metadata(from_table=tb) + tb = tb.merge(who_tb, on=["country", "year"], how="outer") + + # Add the polio surveillance status based on the screening and testing rates + tb.loc[ + (tb["non_polio_afp_rate"] >= 2.0) + & (tb["pct_adequate_stool_collection"] >= 80) + & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Adequate screening and testing" + tb.loc[ + (tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Inadequate testing" + tb.loc[ + (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] >= 80) & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Inadequate screening" + tb.loc[ + (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Inadequate screening and testing" + + tb = identify_low_risk_countries(tb, tb_regions, ds_regions) + # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise + tb["polio_surveillance_status"] = tb["polio_surveillance_status"].copy_metadata(tb["non_polio_afp_rate"]) + return tb + + +def add_correction_factor(tb: Table) -> Table: + """ + Adding the correction factor to estimate polio cases based on reported cases. + + Following Tebbens et al. (2010) - https://www.sciencedirect.com/science/article/pii/S0264410X10014957?via%3Dihub + + The correction factor is 7 for all years before 1996. + The correction factor is 1.11 for all countries when 1996 <= year <= 2000. If the 'non_polio_afp_rate' is < 1 OR 'percent_adequate_stool_collection' < 60, then the correction factor = 7.
+ If the 'non_polio_afp_rate' is < 2 OR 'percent_adequate_stool_collection' < 80, then the correction factor = 2. If the 'non_polio_afp_rate' is >= 2 AND 'percent_adequate_stool_collection' >= 80, then the correction factor = 1.11. + If both 'non_polio_afp_rate' and 'percent_adequate_stool_collection' are missing then the correction factor is 7. + + There are some manual changes we make: + + - Namibia had 'percent_adequate_stool_collection' > 100 in 2011 and 2014 but for other years its correction factor is 1.11 so we set it as 1.11 for 2011 and 2014. + + - For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988. + + (We set the correction factor as NA for all of 2021 as the values of 'percent_adequate_stool_collection' seemed unreliable in this year.) + + """ + # tb["correction_factor"] = pd.NA + # Correction factor for years 1996-2000 is 1.11. + tb.loc[(tb["year"] >= 1996) & (tb["year"] <= 2000), "correction_factor"] = 1.11 + # If the 'non_polio_afp_rate' is < 1 OR 'percent_adequate_stool_collection' < 60, then the correction factor = 7. + tb.loc[(tb["non_polio_afp_rate"] < 1.0) | (tb["pct_adequate_stool_collection"] < 60), "correction_factor"] = 7.0 + # If the 'non_polio_afp_rate' is < 2 OR 'percent_adequate_stool_collection' < 80, then the correction factor = 2. + tb.loc[(tb["non_polio_afp_rate"] < 2.0) | (tb["pct_adequate_stool_collection"] < 80), "correction_factor"] = 2.0 + # If the 'non_polio_afp_rate' is >= 2 AND 'percent_adequate_stool_collection' >= 80, then the correction factor = 1.11. + tb.loc[(tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] >= 80), "correction_factor"] = 1.11 + # If both 'non_polio_afp_rate' and 'percent_adequate_stool_collection' are missing then the correction factor is 7. + tb.loc[(tb["non_polio_afp_rate"].isna()) & (tb["pct_adequate_stool_collection"].isna()), "correction_factor"] = 7.0 + # Correction factor for years before 1996 is 7.
+ tb.loc[tb["year"] < 1996, "correction_factor"] = 7.0 + + # tb.loc[tb["year"] == 2021, "correction_factor"] = np.nan + + # Namibia had 'percent_adequate_stool_collection' > 100 in 2011 and 2014 but for other years its correction factor is 1.11 so we set it as 1.11 for 2011 and 2014. + tb.loc[(tb["country"] == "Namibia") & (tb["year"].isin([2011, 2014])), "correction_factor"] = 1.11 + # For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988. + tb.loc[(tb["country"] == "China") & (tb["year"].isin([1989, 1990, 1991, 1992])), "correction_factor"] = 1.11 + tb.loc[(tb["country"] == "Oman") & (tb["year"].isin([1988])), "correction_factor"] = 1.11 + # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise + tb["correction_factor"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins + return tb + + +def clean_adequate_stool_collection(tb: Table) -> Table: + """ + Some values for "Adequate stool collection" are over 100%, we should set these to NA.
+ """ + tb.loc[tb["pct_adequate_stool_collection"] > 100, "pct_adequate_stool_collection"] = pd.NA + return tb + + +def remove_pre_2001_data(tb: Table) -> Table: + """Remove data from before 2001.""" + tb = tb[tb["year"] >= 2001].reset_index(drop=True) + return tb diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json b/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json new file mode 100644 index 00000000000..383f9f88205 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json @@ -0,0 +1,196 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": 
"Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + 
"Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + 
"Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe" +} \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml b/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml new file mode 100644 index 00000000000..2882d8f9b3c --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml @@ -0,0 +1,20 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + polio_historical: + variables: + cases: + title: Polio cases + description_short: Number of new cases of polio reported in a given year. + unit: cases diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.py b/etl/steps/data/garden/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..24975e73222 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.py @@ -0,0 +1,36 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("polio_historical") + + # Read table from meadow dataset. + tb = ds_meadow["polio_historical"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. 
+ ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..b4ae4f6b3b0 --- /dev/null +++ b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,21 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("polio_free_countries") + tb = ds_garden["polio_free_countries"] + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/who/2024-04-08/polio.py b/etl/steps/data/grapher/who/2024-04-08/polio.py new file mode 100644 index 00000000000..89a65e7d6af --- /dev/null +++ b/etl/steps/data/grapher/who/2024-04-08/polio.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("polio") + + # Read table from garden dataset. + tb = ds_garden["polio"] + + # + # Process data. + # + tb = tb.drop(columns="footnote") + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..9d808e0ffef --- /dev/null +++ b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,22 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("polio_free_countries.csv") + tb = snap.read() + tb = tb.format() + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_status.py b/etl/steps/data/meadow/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..b44471e4445 --- /dev/null +++ b/etl/steps/data/meadow/health/2024-04-12/polio_status.py @@ -0,0 +1,27 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("polio_status.csv") + tb = snap.read() + + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(keys=["who_region", "year_certified_polio_free"]) + + # + # Save outputs. + # + # Create a new meadow dataset. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..e7ba26edcf6 --- /dev/null +++ b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("polio_afp.csv") + + # Load data from snapshot. + tb = snap.read() + tb = tb.rename(columns={"Country / Territory / Region": "country", "Year": "year"}) + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..684655a90b7 --- /dev/null +++ b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py @@ -0,0 +1,38 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("polio_historical.xls") + + # Load data from snapshot. + tb = snap.read(sheet_name="Polio") + + # + # Process data. 
+ # + tb = pr.melt(tb, id_vars=["WHO_REGION", "ISO_code", "Cname", "Disease"], var_name="year", value_name="cases") + tb = tb.drop(columns=["WHO_REGION", "ISO_code", "Disease"], errors="raise") + tb = tb.rename(columns={"Cname": "country"}, errors="raise") + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc new file mode 100644 index 00000000000..e24468c687e --- /dev/null +++ b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Polio-Free Countries + description: |- + This dataset shows the last recorded case of indigenous wild poliovirus (WPV) by country. As data methodology and quality varied widely across regions and countries in earlier years, this table is based on the best-available sources for the years before 2000. 
+ date_published: "2023" + # Citation + producer: Global Polio Eradication Initiative + citation_full: |- + Global Polio Eradication Initiative (2023) + + # Files + url_main: https://polioeradication.org/where-we-work/polio-free-countries/ + date_accessed: 2024-04-12 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://polioeradication.org/terms-of-use/ + +outs: + - md5: 3670959ed02bcdb84fc0080734fc1bf7 + size: 5123 + path: polio_free_countries.csv diff --git a/snapshots/health/2024-04-12/polio_free_countries.py b/snapshots/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..74d8e6c0646 --- /dev/null +++ b/snapshots/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,32 @@ +"""Script to create a snapshot of dataset. + +The data is from this page: https://polioeradication.org/where-we-work/polio-free-countries/ + +The table was copied into a csv and rearranged so that it only has two columns, country and year. + +Then this was uploaded to snapshot. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"health/{SNAPSHOT_VERSION}/polio_free_countries.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/health/2024-04-12/polio_status.csv.dvc b/snapshots/health/2024-04-12/polio_status.csv.dvc new file mode 100644 index 00000000000..6a1897c27ef --- /dev/null +++ b/snapshots/health/2024-04-12/polio_status.csv.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Polio Eradication Initiative - Certification Status + date_published: "2021" + # Citation + producer: Global Polio Eradication Initiative + citation_full: |- + Global Polio Eradication Initiative (2024) + attribution: Global Polio Eradication Initiative (2024) + attribution_short: GPEI + + # Files + url_main: https://polioeradication.org/ + date_accessed: 2024-04-12 + + # License + license: + name: https://polioeradication.org/terms-of-use/ + url: https://polioeradication.org/terms-of-use/ +outs: + - md5: e9052b2095a1c01afe2f954eda183344 + size: 140 + path: polio_status.csv diff --git a/snapshots/health/2024-04-12/polio_status.py b/snapshots/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..f0c03782c79 --- /dev/null +++ b/snapshots/health/2024-04-12/polio_status.py @@ -0,0 +1,56 @@ +"""Script to create a snapshot of dataset. + +Data are transcribed from this webpage: + +https://polioeradication.org/polio-today/preparing-for-a-polio-free-world/certification/ + +""" + +from pathlib import Path + +import click +import pandas as pd +from owid.datautils.io import df_to_file + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. 
+ snap = Snapshot(f"health/{SNAPSHOT_VERSION}/polio_status.csv") + + df = pd.DataFrame( + data={ + "who_region": [ + "Africa", + "Americas", + "South-East Asia", + "Europe", + "Eastern Mediterranean", + "Western Pacific", + ], + "year_certified_polio_free": [ + 2020, + 1994, + 2014, + 2002, + pd.NA, + 2000, + ], + } + ) + snap.create_snapshot(data=df, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-08/polio_afp.csv.dvc b/snapshots/who/2024-04-08/polio_afp.csv.dvc new file mode 100644 index 00000000000..2ee7ee44d99 --- /dev/null +++ b/snapshots/who/2024-04-08/polio_afp.csv.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Polio Cases and Acute Flaccid Paralysis (AFP) Surveillance + date_published: "2024-04-04" + + # Citation + producer: World Health Organization + citation_full: |- + World Health Organization - Polio cases (2024). + attribution_short: WHO + # Files + url_main: https://extranet.who.int/polis/public/CaseCount.aspx + date_accessed: 2024-04-08 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://www.who.int/about/policies/publishing/copyright + +outs: + - md5: 00d57ac66f4507ae66a35fecec365971 + size: 156591 + path: polio_afp.csv diff --git a/snapshots/who/2024-04-08/polio_afp.py b/snapshots/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..aae52da6c33 --- /dev/null +++ b/snapshots/who/2024-04-08/polio_afp.py @@ -0,0 +1,37 @@ +"""Script to create a snapshot of dataset. + +To find the data needed to run this step, follow these steps: + + - Go to https://extranet.who.int/polis/public/CaseCount.aspx + - Select 'World' in the Region list + - Select all countries in the year of onset list (you may need to use cmd+a to do this) + - Ensure the 'Country Detail' box is checked + - Click 'Show data' + - Select the outputted table and copy it to a CSV file, e.g.
in Excel + - This is the local file to be loaded in the snapshot + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"who/{SNAPSHOT_VERSION}/polio_afp.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-09/polio_historical.py b/snapshots/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..bac3f8c044c --- /dev/null +++ b/snapshots/who/2024-04-09/polio_historical.py @@ -0,0 +1,31 @@ +"""Script to create a snapshot of dataset. + +The data is no longer available from the WHO, but it is available on the Web Archive, e.g. here "https://web.archive.org/web/20200713223806/http://www.who.int/immunization/monitoring_surveillance/data/incidence_series.xls" + +It can be downloaded and then used to create a snapshot from the local file. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"who/{SNAPSHOT_VERSION}/polio_historical.xls") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3.
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-09/polio_historical.xls.dvc b/snapshots/who/2024-04-09/polio_historical.xls.dvc new file mode 100644 index 00000000000..10ab583cea4 --- /dev/null +++ b/snapshots/who/2024-04-09/polio_historical.xls.dvc @@ -0,0 +1,28 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: WHO Historical Polio Dataset + date_published: "2019-12-10" + description: |- + The World Health Organization (WHO) provides a historical dataset on polio cases, but it is no longer available directly from the WHO website. Instead the web archived version can be downloaded [here](https://web.archive.org/web/20200101000000*/http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1) and it can be accessed by opening in 'Read Only' mode. + + # Citation + producer: World Health Organization + citation_full: |- + World Health Organization - Historical Polio Dataset (2019). 
+ attribution_short: WHO + # Files + url_main: https://www.who.int/news-room/fact-sheets/detail/poliomyelitis + date_accessed: 2024-04-09 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://www.who.int/about/policies/publishing/copyright + +outs: + - md5: 189201470a046c95b5f38c05a77fd6c2 + size: 612864 + path: polio_historical.xls From 0ac820e498250b84939a051d3656cca15abb2cb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucas=20Rod=C3=A9s-Guirao?= Date: Thu, 18 Apr 2024 11:27:53 -0400 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=93=8A=20data:=20undp=202024=20(#2504?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * wip: undp * undp: grapher, metadata * snapshot * chart-upgrader: drop rows with old=new, informative msg * wizard: chart-upgrader improve explore mode * archive undp hdr old version * move step for clarity * integrate Pablo's suggestions * remove debugging code --- apps/wizard/pages/charts/variable_config.py | 39 ++ dag/archive/main.yml | 9 + dag/main.yml | 13 +- .../un/2024-04-09/undp_hdr.countries.json | 208 +++++++ .../garden/un/2024-04-09/undp_hdr.meta.yml | 548 ++++++++++++++++++ .../data/garden/un/2024-04-09/undp_hdr.py | 62 ++ .../data/grapher/un/2024-04-09/undp_hdr.py | 32 + .../data/meadow/un/2024-04-09/undp_hdr.py | 32 + snapshots/un/2024-04-09/undp_hdr.csv.dvc | 35 ++ snapshots/un/2024-04-09/undp_hdr.py | 24 + 10 files changed, 995 insertions(+), 7 deletions(-) create mode 100644 etl/steps/data/garden/un/2024-04-09/undp_hdr.countries.json create mode 100644 etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml create mode 100644 etl/steps/data/garden/un/2024-04-09/undp_hdr.py create mode 100644 etl/steps/data/grapher/un/2024-04-09/undp_hdr.py create mode 100644 etl/steps/data/meadow/un/2024-04-09/undp_hdr.py create mode 100644 snapshots/un/2024-04-09/undp_hdr.csv.dvc create mode 100644 snapshots/un/2024-04-09/undp_hdr.py diff --git a/apps/wizard/pages/charts/variable_config.py 
b/apps/wizard/pages/charts/variable_config.py index 2577e37494b..a69486d219e 100644 --- a/apps/wizard/pages/charts/variable_config.py +++ b/apps/wizard/pages/charts/variable_config.py @@ -1,7 +1,9 @@ """Concerns the second stage of wizard charts, when the variable mapping is constructed.""" + from typing import Any, Dict, List import pandas as pd +import plotly.express as px import streamlit as st from pydantic import BaseModel from streamlit_extras.grid import grid @@ -384,8 +386,23 @@ def plot_comparison_two_variables(df, variable_old, variable_new, var_id_to_disp # st.write(countries) # if countries: # df_variables = df_variables[df_variables["entityName"].isin(countries)] + score = round(100 - df_variables["Relative difference (abs, %)"].mean(), 1) + if score == 100: + score = round(100 - df_variables["Relative difference (abs, %)"].mean(), 2) + if score == 100: + score = round(100 - df_variables["Relative difference (abs, %)"].mean(), 3) + if score == 100: + score = round(100 - df_variables["Relative difference (abs, %)"].mean(), 4) + num_nan_score = df_variables["Relative difference (abs, %)"].isna().sum() + + nrows_0 = df_variables.shape[0] ## Keep only rows with relative difference != 0 df_variables = df_variables[df_variables["Relative difference (abs, %)"] != 0] + ## Keep only rows with different values (old != new) + df_variables = df_variables[ + df_variables[var_id_to_display[variable_old]] != df_variables[var_id_to_display[variable_new]] + ] + nrows_1 = df_variables.shape[0] # Row sanity check ## (Streamlit has a limit on the number of rows it can show) @@ -404,9 +421,31 @@ def plot_comparison_two_variables(df, variable_old, variable_new, var_id_to_disp # cmap="OrRd", subset=["Relative difference (abs, %)"], vmin=0, vmax=20 # ) + # Show preliminary information + nrows_change_relative = round(100 * nrows_1 / nrows_0, 1) + col1, col2 = st.columns([1, 5]) + with col1: + st.metric( + "Data matching score (%)", + score, + help="The data matching score is 
based on the average of the relative difference between the two variables. A high score indicates a good match. It is estimated as `100 - average(relative scores)`.", + ) + with col2: + st.info( + f""" + - {num_nan_score} rows with unknown score + - {nrows_change_relative} % of the rows changed ({nrows_1} out of {nrows_0}) + """ + ) # Show table st.dataframe(df_variables) + # Show distribution of relative change + fig = px.histogram( + df_variables, x="Relative difference (abs, %)", nbins=100, title="Distribution of relative change" + ) + st.plotly_chart(fig, use_container_width=True) + def reset_variable_form() -> None: """ "Reset variable form.""" diff --git a/dag/archive/main.yml b/dag/archive/main.yml index 33911f6aed4..c1d337d6a3a 100644 --- a/dag/archive/main.yml +++ b/dag/archive/main.yml @@ -107,6 +107,15 @@ steps: data://grapher/wvs/2023-03-08/wvs_trust: - data://garden/wvs/2023-03-08/wvs_trust + # UNDP HDR + data://meadow/un/2022-11-29/undp_hdr: + - snapshot://un/2022-11-29/undp_hdr.csv + - snapshot://un/2022-11-29/undp_hdr.xlsx + data://garden/un/2022-11-29/undp_hdr: + - data://meadow/un/2022-11-29/undp_hdr + data://grapher/un/2022-11-29/undp_hdr: + - data://garden/un/2022-11-29/undp_hdr + # Include all active steps plus all archive steps. 
include: - dag/main.yml diff --git a/dag/main.yml b/dag/main.yml index 6856295a242..c0801432278 100644 --- a/dag/main.yml +++ b/dag/main.yml @@ -170,13 +170,12 @@ steps: - data://meadow/hmd/2022-12-07/life_tables # UNDP - data://meadow/un/2022-11-29/undp_hdr: - - snapshot://un/2022-11-29/undp_hdr.csv - - snapshot://un/2022-11-29/undp_hdr.xlsx - data://garden/un/2022-11-29/undp_hdr: - - data://meadow/un/2022-11-29/undp_hdr - data://grapher/un/2022-11-29/undp_hdr: - - data://garden/un/2022-11-29/undp_hdr + data://meadow/un/2024-04-09/undp_hdr: + - snapshot://un/2024-04-09/undp_hdr.csv + data://garden/un/2024-04-09/undp_hdr: + - data://meadow/un/2024-04-09/undp_hdr + data://grapher/un/2024-04-09/undp_hdr: + - data://garden/un/2024-04-09/undp_hdr # # EM-DAT Natural disasters (2023). diff --git a/etl/steps/data/garden/un/2024-04-09/undp_hdr.countries.json b/etl/steps/data/garden/un/2024-04-09/undp_hdr.countries.json new file mode 100644 index 00000000000..3c46d51ff39 --- /dev/null +++ b/etl/steps/data/garden/un/2024-04-09/undp_hdr.countries.json @@ -0,0 +1,208 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African 
Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia 
(country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine, State of": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United 
Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Arab States": "Arab States (UNDP)", + "Congo (Democratic Republic of the)": "Democratic Republic of Congo", + "Eswatini (Kingdom of)": "Eswatini", + "Hong Kong, China (SAR)": "Hong Kong", + "Korea (Democratic People's Rep. of)": "North Korea", + "Korea (Republic of)": "South Korea", + "Moldova (Republic of)": "Moldova", + "Tanzania (United Republic of)": "Tanzania", + "T\u00fcrkiye": "Turkey", + "Europe and Central Asia": "Europe and Central Asia (UNDP)", + "High human development": "High human development (UNDP)", + "Latin America and the Caribbean": "Latin America and the Caribbean (UNDP)", + "Low human development": "Low human development (UNDP)", + "Medium human development": "Medium human development (UNDP)", + "South Asia": "South Asia (UNDP)", + "Sub-Saharan Africa": "Sub-Saharan Africa (UNDP)", + "Very high human development": "Very high human development (UNDP)", + "East Asia and the Pacific": "East Asia and the Pacific (UNDP)" +} diff --git a/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml b/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml new file mode 100644 index 00000000000..42d167457e1 --- /dev/null +++ b/etl/steps/data/garden/un/2024-04-09/undp_hdr.meta.yml @@ -0,0 +1,548 @@ +definitions: + common: + processing_level: minor + presentation: + topic_tags: + - Human Development Index (HDI) + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + title: Human Development Report + update_period_days: 365 + +tables: + undp_hdr: + title: Human Development Report + description: |- + The 2023/24 Human Development Report assesses the dangerous gridlock resulting from uneven development 
progress, intensifying inequality, and escalating political polarization, that we must urgently tackle. The report emphasizes how global interdependence is being reconfigured and proposes a path forward where multilateralism plays a pivotal role. + + Additional resources related to the 2023/2024 Human Development Report can be found online at http://hdr.undp.org. Resources on the website include digital versions and translations of the Report and the overview in multiple languages, an interactive web version of the Report, a set of background papers and think pieces commissioned for the Report, interactive data visualizations and databases of human development indicators, full explanations of the sources and methodologies used in the Report's composite indices, country insights and other background materials, and previous global, regional and national Human Development Reports. Corrections and addenda are also available online. + + Technical notes (region definitions, reports, etc.) can be found at https://hdr.undp.org/sites/default/files/2023-24_HDR/hdr2023-24_technical_notes.pdf. + common: + display: + numDecimalPlaces: 2 + + variables: + ########################################################### + # 1 Human Development Index + ########################################################### + hdi: + title: Human Development Index + unit: "" + description_short: |- + The HDI is a summary measure of key dimensions of human development: a long and healthy life, a good education, and a decent standard of living. Higher values indicate higher human development. 
+ description_key: &description_key_hdi + - "The three dimensions of the HDI are measured with four indicators in total: a long and healthy life is measured by _life expectancy at birth_, good education (knowledge) is measured by two indicators, _expected_ and _mean_ _years of schooling_; and a decent standard of living is measured by _Gross National Income (GNI) per capita_, logarithmized to reflect that incomes become less important as they increase." + - "The index is then calculated by normalizing and aggregating the indicators. First, the indicators are brought onto the same scale, ranging from 0 to 1. This is done by setting minimum and maximum values for each indicator, with a country at or below the minimum value receiving a score of 0, and a country at or above the maximum value receiving a score of 1." + - "The minimum and maximum values for each indicator are defined as follows: _Life expectancy at birth_ ranges between 20 and 85 years; _expected years of schooling_ between 0 and 18 years; _mean years of schooling_, between 0 and 15 years; and _GNI per capita_ between 100 and 75,000 international-$ at 2017 prices." + - The HDI is then estimated as the geometric mean of these indices, or _HDI = (Health index * Education index * Income index)^(1/3)_. The education index is the arithmetic mean (average) of the mean years of schooling and expected years of schooling. + description_from_producer: &description_prod_hdi |- + The Human Development Index (HDI) is a summary measure of average achievement in key dimensions of human development: a long and healthy life, being knowledgeable and having a decent standard of living. The HDI is the geometric mean of normalized indices for each of the three dimensions. + + The health dimension is assessed by life expectancy at birth, the education dimension is measured by mean of years of schooling for adults aged 25 years and more and expected years of schooling for children of school entering age.
The standard of living dimension is measured by gross national income per capita. The HDI uses the logarithm of income, to reflect the diminishing importance of income with increasing GNI. The scores for the three HDI dimension indices are then aggregated into a composite index using geometric mean. Refer to Technical notes for more details. + + The HDI can be used to question national policy choices, asking how two countries with the same level of GNI per capita can end up with different human development outcomes. These contrasts can stimulate debate about government policy priorities. + + The HDI simplifies and captures only part of what human development entails. It does not reflect on inequalities, poverty, human security, empowerment, etc. The HDRO provides other composite indices as broader proxy on some of the key issues of human development, inequality, gender disparity and poverty. + + A fuller picture of a country's level of human development requires analysis of other indicators and information presented in the HDR statistical annex. + display: + numDecimalPlaces: 3 + presentation: + topic_tags: &topic_tags_hdi + - Human Development Index (HDI) + - Life Expectancy + - Global Education + - Economic Growth + grapher_config: &grapher_config_hdi + title: Human Development Index + subtitle: "The Human Development Index (HDI) is a summary measure of key dimensions of human development: a long and healthy life, a good education, and a decent standard of living. Higher values indicate higher human development." 
+ originUrl: https://ourworldindata.org/human-development-index/ + hasMapTab: true + tab: map + yAxis: + max: 1 + min: 0 + hideRelativeToggle: false + map: + time: latest + colorScale: + baseColorScheme: GnBu + binningStrategy: manual + legendDescription: '' + customNumericValues: + - 0.4 + - 0.5 + - 0.6 + - 0.7 + - 0.8 + - 0.9 + - 1 + customNumericMinValue: 1 + selectedEntityNames: + - United Kingdom + - United States + - South Korea + - India + - China + - Brazil + - Nigeria + - Democratic Republic of Congo + - Niger + - Chile + $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json + + hdi_f: + title: Human Development Index (female) + unit: "" + description_short: |- + The HDI is a summary measure of key dimensions of human development (only females): a long and healthy life, a good education, and a decent standard of living. Higher values indicate higher human development. + description_key: *description_key_hdi + description_from_producer: *description_prod_hdi + display: + numDecimalPlaces: 3 + presentation: + topic_tags: *topic_tags_hdi + grapher_config: + title: Human Development Index (females) + <<: *grapher_config_hdi + + hdi_m: + title: Human Development Index (male) + unit: "" + description_short: |- + The HDI is a summary measure of key dimensions of human development (only males): a long and healthy life, a good education, and a decent standard of living. Higher values indicate higher human development. + description_key: *description_key_hdi + description_from_producer: *description_prod_hdi + display: + numDecimalPlaces: 3 + presentation: + topic_tags: *topic_tags_hdi + grapher_config: + title: Human Development Index (males) + <<: *grapher_config_hdi + + hdi_rank: + title: HDI Rank + unit: "" + description_short: |- + Country's global rank based on its Human Development Index (HDI) value. 
+ display: + numDecimalPlaces: 0 + presentation: + topic_tags: *topic_tags_hdi + + ########################################################### + # 2 Inequality-adjusted Human Development Index + ########################################################### + # Inequality in education + ineq_edu: + title: Inequality in education + unit: "" + description_short: |- + The [Atkinson index](#dod:atkinson) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. Inequality is measured here in terms of the number of years adults older than 25 participated in formal education. + description_key: &description_key_ineq + - This metric is needed to estimate the Inequality-adjusted Human Development Index. + - This version of the Atkinson index uses an inequality aversion parameter ε equal to 1. The parameter defines how sensitive the index is to changes in the lower end of the distribution. In this form, the inequality measure is A = 1 - g/μ, where g is the geometric mean and μ is the arithmetic mean of the distribution. + display: + numDecimalPlaces: 2 + presentation: + topic_tags: + - Human Development Index (HDI) + - Global Education + # Inequality in income + ineq_inc: + title: Inequality in income + unit: "" + description_short: |- + The [Atkinson index](#dod:atkinson) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. Depending on the country and year, the data relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + description_key: *description_key_ineq + display: + numDecimalPlaces: 2 + presentation: + topic_tags: + - Human Development Index (HDI) + - Economic Growth + # Inequality in life expectancy + ineq_le: + title: Inequality in Life expectancy + unit: "" + description_short: |- + The [Atkinson index](#dod:atkinson) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. 
Inequality is measured here in terms of the number of years a newborn would live if age-specific mortality rates in the current year were to stay the same throughout its life. + description_key: *description_key_ineq + display: + numDecimalPlaces: 2 + presentation: + topic_tags: + - Human Development Index (HDI) + - Life Expectancy + # Coefficient of human inequality + coef_ineq: + title: Coefficient of human inequality + unit: "" + description_short: |- + Unweighted average (i.e. arithmetic mean) of inequalities in health, education and income. + description_key: + - When all inequalities in dimensions are of a similar magnitude, the coefficient of human inequality and the loss in HDI value differ negligibly. + - When inequalities differ in magnitude, the loss in HDI value tends to be higher than the coefficient of human inequality. + display: + numDecimalPlaces: 2 + presentation: + topic_tags: *topic_tags_hdi + # Inequality-adjusted HDI + ihdi: + title: Inequality-adjusted Human Development Index + unit: "" + description_short: |- + The Inequality-adjusted Human Development Index (IHDI) is a summary measure of key dimensions of human development: a long and healthy life, a good education, and a decent standard of living, adjusted for inequalities in these dimensions. Higher values indicate higher and more equal human development. + description_key: + - The Inequality-adjusted Human Development Index (IHDI) adjusts the Human Development Index (HDI) for inequality in the distribution of each dimension across the population. + - It is based on a distribution-sensitive class of composite indices proposed by Foster, Lopez-Calva and Szekely (2005), which draws on the Atkinson (1970) family of inequality measures. It is computed as a geometric mean of inequality-adjusted dimensional indices. + - The IHDI accounts for inequalities in HDI dimensions by “discounting” each dimension's average value according to its level of inequality. 
The IHDI value equals the HDI value when there is no inequality across people but falls below the HDI value as inequality rises. In this sense the IHDI measures the level of human development when inequality is accounted for. + display: + numDecimalPlaces: 3 + presentation: + topic_tags: *topic_tags_hdi + + ########################################################### + # 3 Gender Development Index + ########################################################### + # Gender Development Index + gdi: + title: Gender Development Index + unit: "" + description_short: |- + The Gender Development Index (GDI) measures gender inequalities in the achievement of key dimensions of human development: a long and healthy life, a good education, and a decent standard of living. The groups below define the level of equality in these achievements as the absolute deviation of GDI from gender parity. + description_key: + - "The Gender Development Index (GDI) measures gender inequalities in achievement in three basic dimensions of human development: health, measured by female and male life expectancy at birth; education, measured by female and male expected years of schooling for children and female and male mean years of schooling for adults ages 25 years and older; and command over economic resources, measured by female and male estimated earned income." + - Values below 1 indicate higher human development for men than women, while values above 1 indicate the opposite. Values close to 1 therefore indicate higher gender equality. + + display: + numDecimalPlaces: 3 + # GDI Group + gdi_group: + title: GDI Group + unit: "" + description_short: |- + The Gender Development Index (GDI) measures gender inequalities in the achievement of key dimensions of human development: a long and healthy life, a good education, and a decent standard of living. The groups below define the level of equality in these achievements as the absolute deviation of GDI from gender parity. 
+ description_key: + - The GDI groups are based on the absolute deviation of GDI from gender parity, 100 ∙ |GDI - 1|. + - "Group 1: Countries with absolute deviation from gender parity of 2.5 percent or less are considered countries with high equality in HDI achievements between women and men." + - "Group 2: Countries with absolute deviation from gender parity of 2.5-5 percent are considered countries with medium-high equality in HDI achievements between women and men." + - "Group 3: Countries with absolute deviation from gender parity of 5-7.5 percent are considered countries with medium equality in HDI achievements between women and men." + - "Group 4: Countries with absolute deviation from gender parity of 7.5-10 percent are considered countries with medium-low equality in HDI achievements between women and men." + - "Group 5: Countries with absolute deviation from gender parity of more than 10 percent are considered countries with low equality in HDI achievements between women and men." + display: + numDecimalPlaces: 0 + + ########################################################### + # 4 Gender Inequality Index + ########################################################### + # Gender Inequality Index + gii: + title: Gender Inequality Index + unit: "" + description_short: |- + This index covers three dimensions: reproductive health, empowerment, and economic status. Scores are between 0-1 and higher values indicate higher inequalities. + description_key: + - The Gender Inequality Index (GII) reflects gender-based disadvantage in three dimensions— reproductive health, empowerment and the labour market—for as many countries as data of reasonable quality allow. It shows the loss in potential human development due to inequality between female and male achievements in these dimensions. It ranges from 0, where women and men fare equally, to 1, where one gender fares as poorly as possible in all measured dimensions. 
+ - GII values are computed using the association-sensitive inequality measure suggested by Seth (2009), which implies that the index is based on the general mean of general means of different orders—the first aggregation is by a geometric mean across dimensions; these means, calculated separately for women and men, are then aggregated using a harmonic mean across genders. + display: + numDecimalPlaces: 3 + # GII rank + gii_rank: + title: GII Rank + unit: "" + description_short: Country's global rank based on its Gender Inequality Index (GII) value. + display: + numDecimalPlaces: 0 + + ########################################################### + # 6 Planetary pressures–adjusted Human Development Index + ########################################################### + # Planetary pressures–adjusted Human Development Index + phdi: + title: Planetary pressures-adjusted Human Development Index + unit: "" + description_short: |- + Adjustment of the Human Development Index (HDI) for planetary pressures in the Anthropocene. The PHDI discounts the HDI for pressures on the planet to reflect a concern for intergenerational inequality, similar to the Inequality-adjusted HDI adjustment, which is motivated by a concern for intragenerational inequality. + description_key: + - The PHDI is computed as the product of the HDI and (1 - index of planetary pressures), where (1 - index of planetary pressures) can be seen as an adjustment factor. + - The PHDI is the level of human development adjusted by carbon dioxide emissions per capita (production) and material footprint per capita to account for the excessive human pressure on the planet. + - The PHDI should be seen as an incentive for transformation. In an ideal scenario, with no pressures on the planet, the PHDI equals the HDI. + - However, as pressures increase, the PHDI falls below the HDI. In this sense the PHDI measures the level of human development when planetary pressures are considered. 
+ + # Differences between HDI and PHDI + diff_hdi_phdi: + title: Difference from HDI value + unit: "%" + short_unit: "%" + description_short: |- + Difference in ranks on the PHDI and the HDI is calculated only for countries for which both a PHDI value and an HDI value are calculated. Due to data limitations, PHDI values are calculated for a smaller set of countries. + display: + numDecimalPlaces: 2 + rankdiff_hdi_phdi: + title: Difference from HDI rank + unit: "" + description_short: |- + To calculate the PHDI rank difference from the HDI rank, HDI ranks are recalculated for the set of countries for which a PHDI value is calculated. + display: + numDecimalPlaces: 0 + + mf: + title: Material footprint per capita + unit: "tonnes per capita" + description_short: |- + Attribution of global material extraction to domestic final demand of a country. Material footprint is calculated as raw material equivalent of imports plus domestic extraction minus raw material equivalents of exports. + description_key: + - The total material footprint is the sum of the material footprint for biomass, fossil fuels, metal ores and nonmetal ores. + - Material footprint per capita describes the average material use for final demand. + description_from_producer: |- + UNDP relies on UNEP (2023). + display: + numDecimalPlaces: 1 + + ########################################################### + # 0 Core indicators + # + # These indicators are provided by UNDP, and used by them to + # estimate their main indicators (e.g. HDI, GII, etc.) + ########################################################### + ## Life Expectancy + le: + title: Life expectancy at birth + unit: years + short_unit: years + description_short: |- + Average number of years a newborn would live if age-specific mortality rates in the current year were to stay the same throughout its life. + description_from_producer: &description_prod_le |- + UNDP originally obtained this indicator from UNDESA (2022).
+ + le_f: + title: Life expectancy at birth (female) + unit: years + short_unit: years + description_short: |- + Average number of years a female newborn would live if age-specific mortality rates in the current year were to stay the same throughout its life. + description_from_producer: *description_prod_le + + le_m: + title: Life expectancy at birth (male) + unit: years + short_unit: years + description_short: |- + Average number of years a male newborn would live if age-specific mortality rates in the current year were to stay the same throughout its life. + description_from_producer: *description_prod_le + + ## Expected years of schooling + eys: + title: Expected years of schooling + unit: years + short_unit: years + description_short: |- + Number of years a child of school-entrance-age can expect to receive if the current age-specific enrollment rates persist throughout the child’s years of schooling. + description_from_producer: &description_prod_eys |- + UNDP relies on: CEDLAS and World Bank (2023), ICF Macro Demographic and Health Surveys (various years), UNESCO Institute for Statistics (2023) and United Nations Children's Fund (UNICEF) Multiple Indicator Cluster Surveys (various years). + + eys_f: + title: Expected years of schooling (female) + unit: years + short_unit: years + description_short: |- + Number of years a child (female) of school-entrance-age can expect to receive if the current age-specific enrollment rates persist throughout the child’s years of schooling. + description_from_producer: *description_prod_eys + + eys_m: + title: Expected years of schooling (male) + unit: years + short_unit: years + description_short: |- + Number of years a child (male) of school-entrance-age can expect to receive if the current age-specific enrollment rates persist throughout the child’s years of schooling. 
+ description_from_producer: *description_prod_eys + + ## Mean years of schooling + mys: + title: Average years of schooling + unit: years + short_unit: years + description_short: |- + Average number of years adults over 25 years participated in formal education. + description_key: &description_key_mys + - Formal education is primary/ISCED 1 or higher. + - This indicator does not include years spent repeating grades. + description_from_producer: &description_prod_mys |- + UNDP relies on: Barro and Lee (2018), ICF Macro Demographic and Health Surveys (various years), OECD (2023), UNESCO Institute for Statistics (2023) and UNICEF Multiple Indicator Cluster Surveys (various years). + + mys_f: + title: Average years of schooling (female) + unit: years + short_unit: years + description_short: |- + Average number of years (excluding years spent repeating individual grades) adults over 25 years participated in formal education. + description_key: *description_key_mys + description_from_producer: *description_prod_mys + + mys_m: + title: Average years of schooling (male) + unit: years + short_unit: years + description_short: |- + Average number of years (excluding years spent repeating individual grades) adults over 25 years participated in formal education. + description_key: *description_key_mys + description_from_producer: *description_prod_mys + + ## GNI per capita + gnipc: + title: Gross national income per capita + short_unit: $ + unit: international-$ in 2017 prices + description_short: |- + Measures the total income earned by residents of a country, including income earned abroad. This data is adjusted for inflation and differences in the cost of living between countries. + description_key: &description_key_gnipc + - GNI per capita (formerly GNP per capita) is the gross national income, converted to U.S. dollars using the World Bank Atlas method, divided by the midyear population. 
GNI is the sum of value added by all resident producers plus any product taxes (less subsidies) not included in the valuation of output plus net receipts of primary income (compensation of employees and property income) from abroad (World Bank). + - GNI, calculated in national currency, is usually converted to U.S. dollars at official exchange rates for comparisons across economies, although an alternative rate is used when the official exchange rate is judged to diverge by an exceptionally large margin from the rate actually applied in international transactions (World Bank). + description_from_producer: &description_prod_gnipc |- + UNDP relies on IMF (2023), UNDESA (2023), United Nations Statistics Division (2023), World Bank (2023). + + gni_pc_m: + title: Gross national income per capita (male) + short_unit: $ + unit: international-$ in 2017 prices + description_short: |- + Measures the total income earned by male residents of a country, including income earned abroad. This data is adjusted for inflation and differences in the cost of living between countries. + description_key: *description_key_gnipc + description_from_producer: *description_prod_gnipc + + gni_pc_f: + title: Gross national income per capita (female) + short_unit: $ + unit: international-$ in 2017 prices + description_short: |- + Measures the total income earned by female residents of a country, including income earned abroad. This data is adjusted for inflation and differences in the cost of living between countries. + description_key: *description_key_gnipc + description_from_producer: *description_prod_gnipc + + ########################################################### + # Others + ########################################################### + abr: + title: Adolescent Birth Rate + unit: births per 1,000 women ages 15-19 + description_short: |- + Annual number of births to women aged 15-19 years per 1,000 women in that age group. 
It is also referred to as the age-specific fertility rate for women aged 15-19. + description_from_producer: |- + UNDP relies on UNDESA (2022). + + # Labour force + lfpr_f: + title: Labour force participation rate (female) + short_unit: "%" + unit: "% ages 15 and older" + description_short: |- + Measures the proportion of a country's female working-age population that engages actively in the labour market, either by working or looking for work. + description_key: &description_key_lfpr + - It provides an indication of the size of the supply of labour available to engage in the production of goods and services, relative to the population at working age (ILOSTAT). + - The breakdown of the labour force (formerly known as economically active population) by sex and age group gives a profile of the distribution of the labour force within a country (ILOSTAT). + description_from_producer: &description_prod_lfpr |- + UNDP relies on ILO (2023). + display: + numDecimalPlaces: 1 + + lfpr_m: + title: Labour force participation rate (male) + short_unit: "%" + unit: "% ages 15 and older" + description_short: |- + Measures the proportion of a country's male working-age population that engages actively in the labour market, either by working or looking for work. + description_key: *description_key_lfpr + description_from_producer: *description_prod_lfpr + + # Maternal mortality rate + mmr: + title: Maternal mortality ratio + unit: "deaths per 100,000 live births" + description_short: |- + The number of maternal deaths during a given time period per 100,000 live births during the same time period. + description_key: + - It depicts the risk of maternal death relative to the number of live births and essentially captures the risk of death in a single pregnancy or a single live birth (WHO). 
+ - Maternal deaths are defined as "the annual number of female deaths from any cause related to or aggravated by pregnancy or its management (excluding accidental or incidental causes) during pregnancy and childbirth or within 42 days of termination of pregnancy, irrespective of the duration and site of the pregnancy, expressed per 100,000 live births, for a specified time period." (WHO). + - Live births are defined as "the complete expulsion or extraction from its mother of a product of conception, irrespective of the duration of the pregnancy, which, after such separation, breathes or shows any other evidence of life such as beating of the heart, pulsation of the umbilical cord, or definite movement of voluntary muscles, whether or not the umbilical cord has been cut or the placenta is attached." (WHO). + - (ICD-10) International reporting of maternal mortality For the purpose of international reporting of maternal mortality, only those maternal deaths occurring before the end of the 42-day reference period should be included in the calculation of the various ratios and rates. The recording of later deaths is encouraged to inform national, regional, and global understanding of these events (WHO). + description_from_producer: |- + UNDP relies on WHO, UNICEF, UNFPA, World Bank Group and UNDESA/Population Division (2023). + + # Parliament seats + pr_f: + title: Share of seats in parliament (female) + short_unit: "%" + unit: "% held by women" + description_short: |- + Share of all seats in parliaments that are held by female representatives. + description_from_producer: |- + UNDP relies on IPU (2023). + + pr_m: + title: Share of seats in parliament (male) + short_unit: "%" + unit: "% held by men" + description_short: |- + Share of all seats in parliaments that are held by male representatives. + description_from_producer: |- + UNDP relies on IPU (2023). 
+ + # Secondary Education + se_f: + title: Population with at least some secondary education (female) + short_unit: "%" + unit: "% ages 25 and older" + description_short: |- + The percentage of population (female, age 25 and over) with at least completed lower secondary education (ISCED 2 or higher). + description_key: + - This indicator is calculated by dividing the number of persons (females) aged 25 years and above with completed lower secondary education by the total female population of the same age group and multiplying the result by 100. + - The UNESCO Institute for Statistics (UIS) educational attainment dataset shows the educational composition of the population aged 25 years and above and hence the stock and quality of human capital within a country. + - The dataset also reflects the structure and performance of the education system and its accumulated impact on human capital formation. + description_from_producer: |- + Data was sourced by UNDP from: Barro and Lee (2018), ICF Macro Demographic and Health Surveys (various years), OECD (2023), UNESCO Institute for Statistics (2023) and United Nations Children's Fund Multiple Indicator Cluster Surveys (various years). + display: + numDecimalPlaces: 1 + se_m: + title: Population with at least some secondary education (male) + short_unit: "%" + unit: "% ages 25 and older" + description_short: |- + The percentage of population (male, age 25 and over) with at least completed lower secondary education (ISCED 2 or higher). + description_key: + - This indicator is calculated by dividing the number of persons (male) aged 25 years and above with completed lower secondary education by the total male population of the same age group and multiplying the result by 100. + - The UNESCO Institute for Statistics (UIS) educational attainment dataset shows the educational composition of the population aged 25 years and above and hence the stock and quality of human capital within a country.
+ - The dataset also reflects the structure and performance of the education system and its accumulated impact on human capital formation. + description_from_producer: |- + Data was sourced by UNDP from: Barro and Lee (2018), ICF Macro Demographic and Health Surveys (various years), OECD (2023), UNESCO Institute for Statistics (2023) and United Nations Children's Fund Multiple Indicator Cluster Surveys (various years). + display: + numDecimalPlaces: 1 + + # Others + co2_prod: + title: Carbon dioxide emissions per capita (production) + unit: "tonnes per capita" + description_short: |- + Carbon dioxide emissions produced as a consequence of human activities (use of coal, oil and gas for combustion and industrial processes, gas flaring and cement manufacture), divided by midyear population. Values are territorial emissions, meaning that emissions are attributed to the country in which they physically occur. + description_from_producer: |- + Data is originally sourced from UNDESA (2022) by UNDP. + + loss: + title: Overall loss + short_unit: "%" + unit: "%" + description_short: | + Percentage difference between the IHDI value and the HDI value. + display: + numDecimalPlaces: 1 + + pop_total: + title: Total population + unit: "millions of people" + description_short: |- + The total number of people living in a country. diff --git a/etl/steps/data/garden/un/2024-04-09/undp_hdr.py b/etl/steps/data/garden/un/2024-04-09/undp_hdr.py new file mode 100644 index 00000000000..f2168e353c3 --- /dev/null +++ b/etl/steps/data/garden/un/2024-04-09/undp_hdr.py @@ -0,0 +1,62 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("undp_hdr") + + # Read table from meadow dataset.
+ tb = ds_meadow["undp_hdr"].reset_index() + + # + # Process data. + # + # Harmonize country names. + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + + # Drop irrelevant columns + tb = tb.drop(columns=["iso3", "hdicode", "region"]) + + # Re-shape table to get (country, year) as index and variables as columns. + tb = tb.melt(id_vars=["country"]) + tb[["variable", "year"]] = tb["variable"].str.extract(r"(.*)_(\d{4})") + tb = tb.pivot(index=["country", "year"], columns="variable", values="value").reset_index() + + # Make Atkinson indices not percentages + atkinson_cols = ["ineq_edu", "ineq_inc", "ineq_le", "coef_ineq"] + for col in atkinson_cols: + tb[col] /= 100 + + # Set dtypes + tb = tb.astype( + { + "country": "category", + "year": int, + **{col: "Float64" for col in tb.columns if col not in ["country", "year"]}, + } + ) + + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/grapher/un/2024-04-09/undp_hdr.py b/etl/steps/data/grapher/un/2024-04-09/undp_hdr.py new file mode 100644 index 00000000000..ff036ccf523 --- /dev/null +++ b/etl/steps/data/grapher/un/2024-04-09/undp_hdr.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("undp_hdr") + + # Read table from garden dataset. + tb = ds_garden["undp_hdr"] + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/un/2024-04-09/undp_hdr.py b/etl/steps/data/meadow/un/2024-04-09/undp_hdr.py new file mode 100644 index 00000000000..889ed21fcaf --- /dev/null +++ b/etl/steps/data/meadow/un/2024-04-09/undp_hdr.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("undp_hdr.csv") + + # Load data from snapshot. + tb = snap.read(encoding="ISO-8859-1") + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/un/2024-04-09/undp_hdr.csv.dvc b/snapshots/un/2024-04-09/undp_hdr.csv.dvc new file mode 100644 index 00000000000..d6b4de8d765 --- /dev/null +++ b/snapshots/un/2024-04-09/undp_hdr.csv.dvc @@ -0,0 +1,35 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Human Development Report + description: |- + The 2023/24 Human Development Report assesses the dangerous gridlock resulting from uneven development progress, intensifying inequality, and escalating political polarization, that we must urgently tackle. 
The report emphasizes how global interdependence is being reconfigured and proposes a path forward where multilateralism plays a pivotal role. + + Additional resources related to the 2023/2024 Human Development Report can be found online at http://hdr.undp.org. Resources on the website include digital versions and translations of the Report and the overview in multiple languages, an interactive web version of the Report, a set of background papers and think pieces commissioned for the Report, interactive data visualizations and databases of human development indicators, full explanations of the sources and methodologies used in the Report’s composite indices, country insights and other background materials, and previous global, regional and national Human Development Reports. Corrections and addenda are also available online. + + Technical notes (region definitions, reports, etc.) can be found at https://hdr.undp.org/sites/default/files/2023-24_HDR/hdr2023-24_technical_notes.pdf. + date_published: "2024-03-14" + version_producer: 2023-2024 + + # Citation + producer: UNDP, Human Development Report + citation_full: |- + UNDP (United Nations Development Programme). 2024. Human Development Report 2023-24: Breaking the gridlock: Reimagining cooperation in a polarized world. New York. 
+ attribution_short: UNDP + + # Files + url_main: https://hdr.undp.org/ + url_download: https://hdr.undp.org/sites/default/files/2023-24_HDR/HDR23-24_Composite_indices_complete_time_series.csv + date_accessed: 2024-04-09 + + # License + license: + name: CC BY 3.0 IGO + url: https://hdr.undp.org/copyright-and-terms-use + +outs: + - md5: 8cb4c1fc8a738416f514c44fcae0420c + size: 1919243 + path: undp_hdr.csv diff --git a/snapshots/un/2024-04-09/undp_hdr.py b/snapshots/un/2024-04-09/undp_hdr.py new file mode 100644 index 00000000000..72bda208460 --- /dev/null +++ b/snapshots/un/2024-04-09/undp_hdr.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"un/{SNAPSHOT_VERSION}/undp_hdr.csv") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main()