From 56df16750de41ef5fe90865a83bd96bcd609d30d Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 8 Apr 2024 17:55:31 +0100 Subject: [PATCH 01/35] adding snapshot - meadow - garden --- dag/health.yml | 6 + .../who/2024-04-08/polio_afp.countries.json | 181 ++++++++++++++++++ .../polio_afp.excluded_countries.json | 2 + .../garden/who/2024-04-08/polio_afp.meta.yml | 54 ++++++ .../data/garden/who/2024-04-08/polio_afp.py | 37 ++++ .../data/meadow/who/2024-04-08/polio_afp.py | 32 ++++ snapshots/who/2024-04-08/polio_afp.csv.dvc | 27 +++ snapshots/who/2024-04-08/polio_afp.py | 37 ++++ 8 files changed, 376 insertions(+) create mode 100644 etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json create mode 100644 etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json create mode 100644 etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml create mode 100644 etl/steps/data/garden/who/2024-04-08/polio_afp.py create mode 100644 etl/steps/data/meadow/who/2024-04-08/polio_afp.py create mode 100644 snapshots/who/2024-04-08/polio_afp.csv.dvc create mode 100644 snapshots/who/2024-04-08/polio_afp.py diff --git a/dag/health.yml b/dag/health.yml index 7ade31c5ed1..3002b37c7c5 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -512,3 +512,9 @@ steps: - data://garden/demography/2023-03-31/population data://grapher/health/2024-04-02/organ_donation_and_transplantation: - data://garden/health/2024-04-02/organ_donation_and_transplantation + + # Polio AFP surveillance + data://meadow/who/2024-04-08/polio_afp: + - snapshot://who/2024-04-08/polio_afp.csv + data://garden/who/2024-04-08/polio_afp: + - data://meadow/who/2024-04-08/polio_afp diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json b/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json new file mode 100644 index 00000000000..bef15363846 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json @@ -0,0 +1,181 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": 
"Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", 
+ "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Reunion": "Reunion", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Helena": "Saint Helena", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Serbia and Montenegro": "Serbia and Montenegro", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Swaziland": "Eswatini", + "Sweden": 
"Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Togo": "Togo", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Venezuela": "Venezuela", + "Viet Nam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Yemen": "Yemen", + "Yugoslavia": "Yugoslavia", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "CAREC": "Central Asia Regional Economic Cooperation", + "Carec": "Central Asia Regional Economic Cooperation", + "Libyan Arab Jamahiriya": "Libya", + "Pacific Island Countries": "Pacific Island Countries", + "Pacific Island countries": "Pacific Island Countries", + "The former Yugoslav Republic of Macedonia": "North Macedonia", + "Timor Leste": "East Timor" +} \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json b/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json new file mode 100644 index 00000000000..0d4f101c7a3 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json @@ -0,0 +1,2 @@ +[ +] diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml new file mode 100644 index 00000000000..372e7ca97ef --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml @@ -0,0 +1,54 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + +tables: + polio_afp: + variables: + # testing_variable: + # title: Testing variable title + # unit: arbitrary units + # short_unit: au + # description_short: Short description of testing variable. + # description_processing: Description of processing of testing variable. + # description_key: List of key points about the indicator. + # description_from_producer: Description of testing variable from producer. + # processing_level: minor + # presentation: + # attribution: + # attribution_short: + # faqs: + # grapher_config: + # title_public: + # title_variant: + # topic_tags: + # display: + # color: + # conversionFactor: 1 + # description: + # entityAnnotationsMap: Test annotation + # includeInTable: + # isProjection: false + # name: Testing variable + # numDecimalPlaces: 0 + # shortUnit: au + # tableDisplay: + # hideAbsoluteChange: + # hideRelativeChange: + # tolerance: 0 + # unit: arbitrary units + # yearIsDay: false + # zeroDay: + {} + diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..159a209389c --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -0,0 +1,37 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("polio_afp") + + # Read table from meadow dataset. + tb = ds_meadow["polio_afp"].reset_index() + + # + # Process data. 
+ # + tb = geo.harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + tb = tb.set_index(["country", "year"], verify_integrity=True) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..8f42fb23fa3 --- /dev/null +++ b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("polio_afp.csv") + + # Load data from snapshot. + tb = snap.read() + tb = tb.rename(columns={"Country / Territory / Region": "country", "Year": "year"}) + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/snapshots/who/2024-04-08/polio_afp.csv.dvc b/snapshots/who/2024-04-08/polio_afp.csv.dvc new file mode 100644 index 00000000000..fa55d155995 --- /dev/null +++ b/snapshots/who/2024-04-08/polio_afp.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: World Health Organization - Acute Flaccid Paralysis + date_published: "2024-04-04" + + # Citation + producer: World Health Organization + citation_full: |- + World Health Organization (2024) + attribution_short: WHO + + # Files + url_main: https://extranet.who.int/polis/public/CaseCount.aspx + date_accessed: 2024-04-08 + + # License + license: + name: CC BY 4.0 + url: https://www.who.int/about/policies/terms-of-use + +outs: + - md5: 00d57ac66f4507ae66a35fecec365971 + size: 156591 + path: polio_afp.csv diff --git a/snapshots/who/2024-04-08/polio_afp.py b/snapshots/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..aae52da6c33 --- /dev/null +++ b/snapshots/who/2024-04-08/polio_afp.py @@ -0,0 +1,37 @@ +"""Script to create a snapshot of dataset. + +To find the data needed to run this step following these steps: + + - Go to https://extranet.who.int/polis/public/CaseCount.aspx + - Select 'World' in the Region list + - Select all countries in the year of onset list (you may need to use cmd+a to do this) + - Ensure the 'Country Detail' box is checked + - Click 'Show data' + - Select the outputted table and copy it to a CSV file, e.g. in excel + - This is the local file to be loaded in the snapshot + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"who/{SNAPSHOT_VERSION}/polio_afp.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() From cbc1fa70628d68ce41b48cef67893328b78c86b9 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 9 Apr 2024 12:36:00 +0100 Subject: [PATCH 02/35] adding garden and grapher steps --- dag/health.yml | 2 + .../who/2024-04-08/polio_afp.countries.json | 4 - .../polio_afp.excluded_countries.json | 6 +- .../garden/who/2024-04-08/polio_afp.meta.yml | 79 +++++++------- .../data/garden/who/2024-04-08/polio_afp.py | 103 ++++++++++++++++++ .../data/grapher/who/2024-04-08/polio_afp.py | 32 ++++++ 6 files changed, 183 insertions(+), 43 deletions(-) create mode 100644 etl/steps/data/grapher/who/2024-04-08/polio_afp.py diff --git a/dag/health.yml b/dag/health.yml index 3002b37c7c5..8aa14c76f0d 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -518,3 +518,5 @@ steps: - snapshot://who/2024-04-08/polio_afp.csv data://garden/who/2024-04-08/polio_afp: - data://meadow/who/2024-04-08/polio_afp + data://grapher/who/2024-04-08/polio_afp: + - data://garden/who/2024-04-08/polio_afp diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json b/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json index bef15363846..cc4418a3e9b 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json @@ -171,11 +171,7 @@ "Yugoslavia": "Yugoslavia", "Zambia": "Zambia", "Zimbabwe": "Zimbabwe", - "CAREC": "Central Asia Regional 
Economic Cooperation", - "Carec": "Central Asia Regional Economic Cooperation", "Libyan Arab Jamahiriya": "Libya", - "Pacific Island Countries": "Pacific Island Countries", - "Pacific Island countries": "Pacific Island Countries", "The former Yugoslav Republic of Macedonia": "North Macedonia", "Timor Leste": "East Timor" } \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json b/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json index 0d4f101c7a3..4e9cbd775bc 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json @@ -1,2 +1,6 @@ [ -] + "CAREC", + "Carec", + "Pacific Island Countries", + "Pacific Island countries" +] \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml index 372e7ca97ef..0ecc16e422c 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml @@ -6,49 +6,52 @@ definitions: - Global Health - Eradication of Diseases - # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 - tables: polio_afp: variables: - # testing_variable: - # title: Testing variable title - # unit: arbitrary units - # short_unit: au - # description_short: Short description of testing variable. - # description_processing: Description of processing of testing variable. - # description_key: List of key points about the indicator. - # description_from_producer: Description of testing variable from producer. 
- # processing_level: minor - # presentation: - # attribution: - # attribution_short: - # faqs: - # grapher_config: - # title_public: - # title_variant: - # topic_tags: - # display: - # color: - # conversionFactor: 1 - # description: - # entityAnnotationsMap: Test annotation - # includeInTable: - # isProjection: false - # name: Testing variable - # numDecimalPlaces: 0 - # shortUnit: au - # tableDisplay: - # hideAbsoluteChange: - # hideRelativeChange: - # tolerance: 0 - # unit: arbitrary units - # yearIsDay: false - # zeroDay: - {} - + afp_cases: + title: Acute Flaccid Paralysis (AFP) cases + description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given year. + unit: cases + non_polio_afp_rate: + title: Non-Polio acute flaccid paralysis (afp) rate + description_short: The number of cases of acute flaccid paralysis (AFP) per 100,000 children aged 15 or under, not attributed to polio, that were detected and reported to the WHO. + description_key: ["Acute flaccid paralysis (AFP) surveillance is the gold standard for detecting cases of poliomyelitis.", "At least one case of non-polio AFP should be detected annually per 100,000 population aged less than 15 years. 
In endemic regions, to ensure even higher sensitivity, this rate should be two per 100 000."] + unit: cases per 100,000 children + pct_adequate_stool_collection: + title: Aqueate stool collection (%) + description_short: "The share of acute flaccid paralysis (AFP) cases, where stool samples were tested for poliovirus and reported to the WHO" + unit: "%" + short_unit: "%" + pending: + title: Pending + description_short: "The number of acute flaccid paralysis (AFP) cases pending classification by the WHO" + unit: cases + wild_poliovirus_cases: + title: Wild Poliovirus cases + description_short: "The number of cases of wild poliovirus detected in a given year" + unit: cases + cvdpv_cases: + title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases + description_short: "The number of cases of circulating vaccine-derived poliovirus detected in a given year" + unit: cases + compatibles: + title: Compatibles + unit: "" + footnote: + title: Footnote + unit: "" + correction_factor: + title: Correction factor + description_short: "Correction factor applied to the number of reported polio cases to account for underreporting" + unit: factor + polio_surveillance_status: + title: Polio surveillance status + description_short: "The status of polio surveillance in a given country" + description_key: ["A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under.", "A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%.", "Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf)"] + unit: "" diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index 159a209389c..b66b1a5ab49 100644 --- 
a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -1,11 +1,16 @@ """Load a meadow dataset and create a garden dataset.""" +import pandas as pd +from owid.catalog import Table + from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +SCREENING_YEAR = 2023 + def run(dest_dir: str) -> None: # @@ -23,6 +28,12 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) + + # Remove data from before 2001. + tb = remove_pre_2001_data(tb) + tb = clean_adequate_stool_collection(tb) + tb = add_correction_factor(tb) + tb = add_screening_and_testing(tb) tb = tb.set_index(["country", "year"], verify_integrity=True) # @@ -35,3 +46,95 @@ def run(dest_dir: str) -> None: # Save changes in the new garden dataset. ds_garden.save() + + +def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: + """ + Adds the polio surveillance status based on the screening and testing rates. + For use in this chart: https://ourworldindata.org/grapher/polio-screening-and-testing + + Parameters: + - tb: table containing polio surveillance data. + - year: Specific year to filter the data. If None, uses current year. + + Returns: + - Modified table with a new column for polio surveillance status. 
+ """ + # tb["polio_surveillance_status"] = pd.NA + tb.loc[ + (tb["non_polio_afp_rate"] >= 2.0) + & (tb["pct_adequate_stool_collection"] >= 80) + & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Adequate screening and testing" + tb.loc[ + (tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Inadequate testing" + tb.loc[ + (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] >= 80) & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Inadequate screening" + tb.loc[ + (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR), + "polio_surveillance_status", + ] = "Inadequate screening and testing" + return tb + + +def add_correction_factor(tb: Table) -> Table: + """ + Adding the correction factor to estimate polio cases based on reported cases. + + Following Tebbens et al (2011) -https://www.sciencedirect.com/science/article/pii/S0264410X10014957?via%3Dihub + + The correction factor is 7 for all years before 1996. + The correction factor is 1.11 for all countries when 1996 >= year <= 2000 if the 'non_polio_afp_rate' is < 1 OR 'percent_adequate_stool_collection' < 60, then the correction factor = 7. + If the 'non_polio_afp_rate' is < 2 OR 'percent_adequate_stool_collection' < 80, then the correction factor = 2. If the 'non_polio_afp_rate' is >= 2 OR 'percent_adequate_stool_collection' >= 80, then the correction factor = 1.11. + If both 'non_polio_afp_rate' and 'percent_adequate_stool_collection' are missing then the correction factor is 7. + + There are some manual changes we make: + + - Namibia had 'percent_adequate_stool_collection' > 100 in 2011 and 2014 but for other years it's correction factor is 1.11 so we set it as 1.11 for 2011 and 2014. + + - For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988. 
+ + (We set the correction factor as NA for all of 2021 as the values of 'percent_adequate_stool_collection' seemed unreliable in this year.) + + """ + # tb["correction_factor"] = pd.NA + # Correction factor for years 1996-2000 is 1.11. + tb.loc[(tb["year"] >= 1996) & (tb["year"] <= 2000), "correction_factor"] = 1.11 + # If the 'non_polio_afp_rate' is < 1 OR 'percent_adequate_stool_collection' < 60, then the correction factor = 7. + tb.loc[(tb["non_polio_afp_rate"] < 1.0) | (tb["pct_adequate_stool_collection"] < 60), "correction_factor"] = 7.0 + # If the 'non_polio_afp_rate' is < 2 OR 'percent_adequate_stool_collection' < 80, then the correction factor = 2. + tb.loc[(tb["non_polio_afp_rate"] < 2.0) | (tb["pct_adequate_stool_collection"] < 80), "correction_factor"] = 2.0 + # If the 'non_polio_afp_rate' is >= 2 OR 'percent_adequate_stool_collection' >= 80, then the correction factor = 1.11. + tb.loc[(tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] >= 80), "correction_factor"] = 1.11 + # If both 'non_polio_afp_rate' and 'percent_adequate_stool_collection' are missing then the correction factor is 7. + tb.loc[(tb["non_polio_afp_rate"].isna()) & (tb["pct_adequate_stool_collection"].isna()), "correction_factor"] = 7.0 + # Correction factor for years before 1996 is 7. + tb.loc[tb["year"] < 1996, "correction_factor"] = 7.0 + + # tb.loc[tb["year"] == 2021, "correction_factor"] = np.nan + + # Namibia had 'percent_adequate_stool_collection' > 100 in 2011 and 2014 but for other years it's correction factor is 1.11 so we set it as 1.11 for 2011 and 2014. + tb["correction_factor"][(tb["country"] == "Namibia") & (tb["year"].isin([2011, 2014]))] = 1.11 + # For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988. 
+ tb["correction_factor"][(tb["country"] == "China") & (tb["year"].isin([1989, 1990, 1991, 1992]))] = 1.11 + tb["correction_factor"][(tb["country"] == "Oman") & (tb["year"].isin([1988]))] = 1.11 + return tb + + +def clean_adequate_stool_collection(tb: Table) -> Table: + """ + Some values for "Adequate stool collection" are over 100%, we should set these to NA. + """ + tb["pct_adequate_stool_collection"][tb["pct_adequate_stool_collection"] > 100] = pd.NA + return tb + + +def remove_pre_2001_data(tb: Table) -> Table: + """Remove data from before 2001.""" + tb = tb[tb["year"] >= 2001] + return tb diff --git a/etl/steps/data/grapher/who/2024-04-08/polio_afp.py b/etl/steps/data/grapher/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..fe8a2a5336f --- /dev/null +++ b/etl/steps/data/grapher/who/2024-04-08/polio_afp.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("polio_afp") + + # Read table from garden dataset. + tb = ds_garden["polio_afp"] + + # + # Process data. + # + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. 
+ ds_grapher.save() From c6f897306f728b6ddea0e1461f092c88c92230e7 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 9 Apr 2024 14:13:46 +0100 Subject: [PATCH 03/35] fudging the origins issue --- etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml | 2 +- etl/steps/data/garden/who/2024-04-08/polio_afp.py | 4 ++++ etl/steps/data/grapher/who/2024-04-08/polio_afp.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml index 0ecc16e422c..e8192b93c40 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml @@ -24,7 +24,7 @@ tables: description_key: ["Acute flaccid paralysis (AFP) surveillance is the gold standard for detecting cases of poliomyelitis.", "At least one case of non-polio AFP should be detected annually per 100,000 population aged less than 15 years. In endemic regions, to ensure even higher sensitivity, this rate should be two per 100 000."] unit: cases per 100,000 children pct_adequate_stool_collection: - title: Aqueate stool collection (%) + title: Adequate stool collection (%) description_short: "The share of acute flaccid paralysis (AFP) cases, where stool samples were tested for poliovirus and reported to the WHO" unit: "%" short_unit: "%" diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index b66b1a5ab49..ea07e45dd42 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -79,6 +79,8 @@ def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR), "polio_surveillance_status", ] = "Inadequate screening and testing" + # Not sure if this is the best way to handle this, the code fails because this 
indicator doesn't have origins otherwise + tb["polio_surveillance_status"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins return tb @@ -123,6 +125,8 @@ def add_correction_factor(tb: Table) -> Table: # For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988. tb["correction_factor"][(tb["country"] == "China") & (tb["year"].isin([1989, 1990, 1991, 1992]))] = 1.11 tb["correction_factor"][(tb["country"] == "Oman") & (tb["year"].isin([1988]))] = 1.11 + # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise + tb["correction_factor"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins return tb diff --git a/etl/steps/data/grapher/who/2024-04-08/polio_afp.py b/etl/steps/data/grapher/who/2024-04-08/polio_afp.py index fe8a2a5336f..1c2467fb601 100644 --- a/etl/steps/data/grapher/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/grapher/who/2024-04-08/polio_afp.py @@ -19,7 +19,7 @@ def run(dest_dir: str) -> None: # # Process data. # - + tb = tb.drop(columns="footnote") # # Save outputs. # From 751fb68100578c1cdeb620b4ced2064f12df7c37 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 9 Apr 2024 14:25:49 +0100 Subject: [PATCH 04/35] adding some comments --- etl/steps/data/garden/who/2024-04-08/polio_afp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index ea07e45dd42..803235f1296 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -31,8 +31,11 @@ def run(dest_dir: str) -> None: # Remove data from before 2001. tb = remove_pre_2001_data(tb) + # Remove values > 100% for "Adequate stool collection". tb = clean_adequate_stool_collection(tb) + # Add correction factor to estimate polio cases based on reported cases. 
tb = add_correction_factor(tb) + # Add polio surveillance status based on the screening and testing rates. tb = add_screening_and_testing(tb) tb = tb.set_index(["country", "year"], verify_integrity=True) From a52c87bb51fa096fa0e4e56430b5d266df7ecfbf Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 9 Apr 2024 14:56:14 +0100 Subject: [PATCH 05/35] adding historical polio cases from who web archive --- dag/health.yml | 2 + .../meadow/who/2024-04-09/polio_historical.py | 38 +++++++++++++++++++ snapshots/who/2024-04-09/polio_historical.py | 31 +++++++++++++++ .../who/2024-04-09/polio_historical.xls.dvc | 27 +++++++++++++ 4 files changed, 98 insertions(+) create mode 100644 etl/steps/data/meadow/who/2024-04-09/polio_historical.py create mode 100644 snapshots/who/2024-04-09/polio_historical.py create mode 100644 snapshots/who/2024-04-09/polio_historical.xls.dvc diff --git a/dag/health.yml b/dag/health.yml index 8aa14c76f0d..75184d74aaa 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -520,3 +520,5 @@ steps: - data://meadow/who/2024-04-08/polio_afp data://grapher/who/2024-04-08/polio_afp: - data://garden/who/2024-04-08/polio_afp + data://meadow/who/2024-04-09/polio_historical: + - snapshot://who/2024-04-09/polio_historical.xls diff --git a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..bfb712d668c --- /dev/null +++ b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py @@ -0,0 +1,38 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("polio_historical.xls") + + # Load data from snapshot. + tb = snap.read(sheet_name="Polio") + + # + # Process data. 
+ # + tb = pr.melt(tb, id_vars=["WHO_REGION", "ISO_code", "Cname", "Disease"], var_name="year", value_name="cases") + tb = tb.drop(columns=["WHO_REGION", "ISO_code", "Disease"]) + tb = tb.rename(columns={"Cname": "country"}) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/who/2024-04-09/polio_historical.py b/snapshots/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..bac3f8c044c --- /dev/null +++ b/snapshots/who/2024-04-09/polio_historical.py @@ -0,0 +1,31 @@ +"""Script to create a snapshot of dataset. + +The data is no longer available from the WHO but it is available on web archive e.g. here "https://web.archive.org/web/20200713223806/http://www.who.int/immunization/monitoring_surveillance/data/incidence_series.xls" + +It can be downloaded and then used to create a snapshot from the local file. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"who/{SNAPSHOT_VERSION}/polio_historical.xls") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-09/polio_historical.xls.dvc b/snapshots/who/2024-04-09/polio_historical.xls.dvc new file mode 100644 index 00000000000..33169b13d00 --- /dev/null +++ b/snapshots/who/2024-04-09/polio_historical.xls.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: WHO Historical Polio Dataset + date_published: "2019-12-10" + + # Citation + producer: World Health Organization + citation_full: |- + World Health Organization (2019) + attribution_short: WHO + + # Files + url_main: https://web.archive.org/web/20200101000000*/http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1 + date_accessed: 2024-04-09 + + # License + license: + name: CC BY 4.0 + url: https://www.who.int/about/policies/terms-of-use + +outs: + - md5: 189201470a046c95b5f38c05a77fd6c2 + size: 612864 + path: polio_historical.xls From faf641bc95bb14765b1be6a2b391939f00966ae1 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 9 Apr 2024 16:40:46 +0100 Subject: [PATCH 06/35] garden step historical cases --- dag/health.yml | 2 + .../polio_historical.countries.json | 196 ++++++++++++++++++ .../who/2024-04-09/polio_historical.meta.yml | 20 ++ .../garden/who/2024-04-09/polio_historical.py | 36 ++++ 4 files changed, 254 insertions(+) create mode 100644 etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json create mode 100644 etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml create mode 100644 etl/steps/data/garden/who/2024-04-09/polio_historical.py diff --git a/dag/health.yml b/dag/health.yml index 75184d74aaa..8221cb68aba 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -522,3 +522,5 @@ steps: - data://garden/who/2024-04-08/polio_afp data://meadow/who/2024-04-09/polio_historical: - 
snapshot://who/2024-04-09/polio_historical.xls + data://garden/who/2024-04-09/polio_historical: + - data://meadow/who/2024-04-09/polio_historical diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json b/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json new file mode 100644 index 00000000000..383f9f88205 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json @@ -0,0 +1,196 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": 
"Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + 
"Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe" +} \ No newline at end of file 
diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml b/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml new file mode 100644 index 00000000000..2882d8f9b3c --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml @@ -0,0 +1,20 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + polio_historical: + variables: + cases: + title: Polio cases + description_short: Number of new cases of polio reported in a given year. + unit: cases diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.py b/etl/steps/data/garden/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..9c549d9fe0b --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.py @@ -0,0 +1,36 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("polio_historical") + + # Read table from meadow dataset. + tb = ds_meadow["polio_historical"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.set_index(["country", "year"], verify_integrity=True) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() From 8337efb44dd2c0d711f147dcea78505b456523aa Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 10 Apr 2024 16:56:05 +0100 Subject: [PATCH 07/35] adding historical data and using correction factor to estimate total cases --- dag/health.yml | 3 +++ etl/steps/data/garden/who/2024-04-08/polio_afp.py | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dag/health.yml b/dag/health.yml index 8221cb68aba..74b140966a7 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -518,8 +518,11 @@ steps: - snapshot://who/2024-04-08/polio_afp.csv data://garden/who/2024-04-08/polio_afp: - data://meadow/who/2024-04-08/polio_afp + - data://meadow/who/2024-04-09/polio_historical data://grapher/who/2024-04-08/polio_afp: - data://garden/who/2024-04-08/polio_afp + + # Polio historical data data://meadow/who/2024-04-09/polio_historical: - snapshot://who/2024-04-09/polio_historical.xls data://garden/who/2024-04-09/polio_historical: diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index 803235f1296..bcf9a73cf1a 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -2,6 +2,7 @@ import pandas as pd from owid.catalog import Table +from owid.catalog import processing as pr from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -18,10 +19,14 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("polio_afp") + ds_historical = paths.load_dataset("polio_historical") # Read table from meadow dataset. tb = ds_meadow["polio_afp"].reset_index() - + tb_hist = ds_historical["polio_historical"].reset_index() + tb_hist = tb_hist.rename(columns={"cases": "total_cases"}) + # Only need this for data prior to 2001 + tb_hist = tb_hist[tb_hist["year"] < 2001] # # Process data. 
# @@ -33,10 +38,16 @@ def run(dest_dir: str) -> None: tb = remove_pre_2001_data(tb) # Remove values > 100% for "Adequate stool collection". tb = clean_adequate_stool_collection(tb) + # Add total cases + tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] + # Need to deal with overlapping years + tb = pr.concat([tb, tb_hist], axis=0) # Add correction factor to estimate polio cases based on reported cases. tb = add_correction_factor(tb) + tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] # Add polio surveillance status based on the screening and testing rates. tb = add_screening_and_testing(tb) + tb = tb.set_index(["country", "year"], verify_integrity=True) # From b9e38e04ce08abeca7ac9eba4a21799902693740 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 10 Apr 2024 17:00:48 +0100 Subject: [PATCH 08/35] adding metadata --- .../polio_afp.excluded_countries.json | 3 ++- .../garden/who/2024-04-08/polio_afp.meta.yml | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json b/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json index 4e9cbd775bc..94f87976655 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json @@ -2,5 +2,6 @@ "CAREC", "Carec", "Pacific Island Countries", - "Pacific Island countries" + "Pacific Island countries", + "Yugoslavia" ] \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml index e8192b93c40..cefbce33ba4 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml @@ -18,6 +18,8 @@ tables: title: Acute Flaccid Paralysis (AFP) cases description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given 
year. unit: cases + display: + numDecimalPlaces: 0 non_polio_afp_rate: title: Non-Polio acute flaccid paralysis (afp) rate description_short: The number of cases of acute flaccid paralysis (AFP) per 100,000 children aged 15 or under, not attributed to polio, that were detected and reported to the WHO. @@ -36,10 +38,14 @@ tables: title: Wild Poliovirus cases description_short: "The number of cases of wild poliovirus detected in a given year" unit: cases + display: + numDecimalPlaces: 0 cvdpv_cases: title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases description_short: "The number of cases of circulating vaccine-derived poliovirus detected in a given year" unit: cases + display: + numDecimalPlaces: 0 compatibles: title: Compatibles unit: "" @@ -55,3 +61,15 @@ tables: description_short: "The status of polio surveillance in a given country" description_key: ["A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under.", "A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%.", "Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf)"] unit: "" + total_cases: + title: Total polio cases + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases + unit: cases + display: + numDecimalPlaces: 0 + estimated_cases: + title: Estimated polio cases + description_short: The total estimated number of polio cases, using Tebbens et al. 
+ unit: cases + display: + numDecimalPlaces: 0 From 29b96a9a6383661f306a49ee16d5196bd569af60 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 10 Apr 2024 20:57:17 +0100 Subject: [PATCH 09/35] adding regions aggs --- dag/health.yml | 2 ++ .../who/2024-04-08/polio_afp.countries.json | 1 - .../data/garden/who/2024-04-08/polio_afp.py | 21 +++++++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/dag/health.yml b/dag/health.yml index 74b140966a7..ea77d479cac 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -519,6 +519,8 @@ steps: data://garden/who/2024-04-08/polio_afp: - data://meadow/who/2024-04-08/polio_afp - data://meadow/who/2024-04-09/polio_historical + - data://garden/wb/2023-04-30/income_groups + - data://garden/regions/2023-01-01/regions data://grapher/who/2024-04-08/polio_afp: - data://garden/who/2024-04-08/polio_afp diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json b/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json index cc4418a3e9b..c8fc7c38076 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json @@ -168,7 +168,6 @@ "Viet Nam": "Vietnam", "West Bank and Gaza": "Palestine", "Yemen": "Yemen", - "Yugoslavia": "Yugoslavia", "Zambia": "Zambia", "Zimbabwe": "Zimbabwe", "Libyan Arab Jamahiriya": "Libya", diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index bcf9a73cf1a..7b04fc09b40 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -4,14 +4,18 @@ from owid.catalog import Table from owid.catalog import processing as pr -from etl.data_helpers import geo +from etl.data_helpers.geo import add_regions_to_table, harmonize_countries from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. 
paths = PathFinder(__file__) +# Year to use for the screening and testing rates. +# Should be the most recent year of complete data. SCREENING_YEAR = 2023 +REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania", "World"] + def run(dest_dir: str) -> None: # @@ -20,6 +24,10 @@ def run(dest_dir: str) -> None: # Load meadow dataset. ds_meadow = paths.load_dataset("polio_afp") ds_historical = paths.load_dataset("polio_historical") + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") # Read table from meadow dataset. tb = ds_meadow["polio_afp"].reset_index() @@ -30,7 +38,7 @@ def run(dest_dir: str) -> None: # # Process data. # - tb = geo.harmonize_countries( + tb = harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) @@ -42,6 +50,15 @@ def run(dest_dir: str) -> None: tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] # Need to deal with overlapping years tb = pr.concat([tb, tb_hist], axis=0) + # Add region aggregates. + tb_reg = add_regions_to_table( + tb[["country", "year", "afp_cases", "wild_poliovirus_cases", "cvdpv_cases", "total_cases"]], + regions=REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + min_num_values_per_year=1, + ) + tb_reg = tb_reg[tb_reg["country"].isin(REGIONS)] # Add correction factor to estimate polio cases based on reported cases. 
tb = add_correction_factor(tb) tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] From 41f047b58593d6b70857fb2d06b19bc1539833ba Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 09:15:42 +0100 Subject: [PATCH 10/35] wip --- etl/steps/data/garden/who/2024-04-08/polio_afp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index 7b04fc09b40..ddb0fc73739 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -59,6 +59,7 @@ def run(dest_dir: str) -> None: min_num_values_per_year=1, ) tb_reg = tb_reg[tb_reg["country"].isin(REGIONS)] + tb = pr.concat([tb, tb_reg], axis=0) # Add correction factor to estimate polio cases based on reported cases. tb = add_correction_factor(tb) tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] From 59286a41cad950d317f21c0b4a41e5b3f1dd8424 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 09:43:57 +0100 Subject: [PATCH 11/35] adding gpei fasttrack and metadat --- dag/health.yml | 1 + .../garden/who/2024-04-08/polio_afp.meta.yml | 24 +++++++++++++++++++ .../data/garden/who/2024-04-08/polio_afp.py | 6 +++++ 3 files changed, 31 insertions(+) diff --git a/dag/health.yml b/dag/health.yml index ea77d479cac..c4b308b84b1 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -521,6 +521,7 @@ steps: - data://meadow/who/2024-04-09/polio_historical - data://garden/wb/2023-04-30/income_groups - data://garden/regions/2023-01-01/regions + - snapshot://fasttrack/latest/gpei.csv data://grapher/who/2024-04-08/polio_afp: - data://garden/who/2024-04-08/polio_afp diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml index cefbce33ba4..6e133e7008f 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml +++ 
b/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml @@ -73,3 +73,27 @@ tables: unit: cases display: numDecimalPlaces: 0 + cvdpv1: + title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases + description_short: "The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year" + unit: cases + display: + numDecimalPlaces: 0 + cvdpv2: + title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases + description_short: "The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year" + unit: cases + display: + numDecimalPlaces: 0 + cvdpv3: + title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases + description_short: "The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year" + unit: cases + display: + numDecimalPlaces: 0 + total_cvdpv: + title: Total Circulating Vaccine-Derived Poliovirus (cVDPV) cases + description_short: "The sum of circulating vaccine-derived poliovirus type 1, 2, and 3 cases" + unit: cases + display: + numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio_afp.py index ddb0fc73739..a2796ef5d26 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio_afp.py @@ -23,7 +23,11 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("polio_afp") + # Load historical polio dataset ds_historical = paths.load_dataset("polio_historical") + # Load fasttrack Global Polio Eradication Initiative on circulating vaccine derived polio cases + snap_cvdpv = paths.load_snapshot("gpei.csv") + tb_cvdpv = snap_cvdpv.read() # Load regions dataset. ds_regions = paths.load_dataset("regions") # Load income groups dataset. 
@@ -50,6 +54,8 @@ def run(dest_dir: str) -> None: tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] # Need to deal with overlapping years tb = pr.concat([tb, tb_hist], axis=0) + tb = tb.merge(tb_cvdpv, on=["country", "year"], how="left") + # Add region aggregates. tb_reg = add_regions_to_table( tb[["country", "year", "afp_cases", "wild_poliovirus_cases", "cvdpv_cases", "total_cases"]], From 6d01ef83f737c6558fa6d58111e741058347489a Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 11:00:47 +0100 Subject: [PATCH 12/35] shuffling things about --- ...fp.countries.json => polio.countries.json} | 0 ...ies.json => polio.excluded_countries.json} | 0 .../{polio_afp.meta.yml => polio.meta.yml} | 8 +------- .../who/2024-04-08/{polio_afp.py => polio.py} | 20 ++++++++++++++++--- .../who/2024-04-08/{polio_afp.py => polio.py} | 4 ++-- 5 files changed, 20 insertions(+), 12 deletions(-) rename etl/steps/data/garden/who/2024-04-08/{polio_afp.countries.json => polio.countries.json} (100%) rename etl/steps/data/garden/who/2024-04-08/{polio_afp.excluded_countries.json => polio.excluded_countries.json} (100%) rename etl/steps/data/garden/who/2024-04-08/{polio_afp.meta.yml => polio.meta.yml} (94%) rename etl/steps/data/garden/who/2024-04-08/{polio_afp.py => polio.py} (93%) rename etl/steps/data/grapher/who/2024-04-08/{polio_afp.py => polio.py} (89%) diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json b/etl/steps/data/garden/who/2024-04-08/polio.countries.json similarity index 100% rename from etl/steps/data/garden/who/2024-04-08/polio_afp.countries.json rename to etl/steps/data/garden/who/2024-04-08/polio.countries.json diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json b/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json similarity index 100% rename from etl/steps/data/garden/who/2024-04-08/polio_afp.excluded_countries.json rename to 
etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml similarity index 94% rename from etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml rename to etl/steps/data/garden/who/2024-04-08/polio.meta.yml index 6e133e7008f..01a8833fe56 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -12,7 +12,7 @@ dataset: update_period_days: 365 tables: - polio_afp: + polio: variables: afp_cases: title: Acute Flaccid Paralysis (AFP) cases @@ -91,9 +91,3 @@ tables: unit: cases display: numDecimalPlaces: 0 - total_cvdpv: - title: Total Circulating Vaccine-Derived Poliovirus (cVDPV) cases - description_short: "The sum of circulating vaccine-derived poliovirus type 1, 2, and 3 cases" - unit: cases - display: - numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/who/2024-04-08/polio_afp.py b/etl/steps/data/garden/who/2024-04-08/polio.py similarity index 93% rename from etl/steps/data/garden/who/2024-04-08/polio_afp.py rename to etl/steps/data/garden/who/2024-04-08/polio.py index a2796ef5d26..bb2978dc637 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -21,13 +21,15 @@ def run(dest_dir: str) -> None: # # Load inputs. # - # Load meadow dataset. + # Load meadow acute flaccid paralysis dataset. ds_meadow = paths.load_dataset("polio_afp") # Load historical polio dataset ds_historical = paths.load_dataset("polio_historical") # Load fasttrack Global Polio Eradication Initiative on circulating vaccine derived polio cases snap_cvdpv = paths.load_snapshot("gpei.csv") tb_cvdpv = snap_cvdpv.read() + # Dropping this as the total_cvdpv is also in the polio_afp table and has more historical data + tb_cvdpv = tb_cvdpv.drop(columns=["total_cvdpv"]) # Load regions dataset. 
ds_regions = paths.load_dataset("regions") # Load income groups dataset. @@ -55,10 +57,21 @@ def run(dest_dir: str) -> None: # Need to deal with overlapping years tb = pr.concat([tb, tb_hist], axis=0) tb = tb.merge(tb_cvdpv, on=["country", "year"], how="left") - # Add region aggregates. tb_reg = add_regions_to_table( - tb[["country", "year", "afp_cases", "wild_poliovirus_cases", "cvdpv_cases", "total_cases"]], + tb[ + [ + "country", + "year", + "afp_cases", + "wild_poliovirus_cases", + "cvdpv_cases", + "total_cases", + "cvdpv1", + "cvdpv2", + "cvdpv3", + ] + ], regions=REGIONS, ds_regions=ds_regions, ds_income_groups=ds_income_groups, @@ -73,6 +86,7 @@ def run(dest_dir: str) -> None: tb = add_screening_and_testing(tb) tb = tb.set_index(["country", "year"], verify_integrity=True) + tb.metadata.short_name = "polio" # # Save outputs. diff --git a/etl/steps/data/grapher/who/2024-04-08/polio_afp.py b/etl/steps/data/grapher/who/2024-04-08/polio.py similarity index 89% rename from etl/steps/data/grapher/who/2024-04-08/polio_afp.py rename to etl/steps/data/grapher/who/2024-04-08/polio.py index 1c2467fb601..89a65e7d6af 100644 --- a/etl/steps/data/grapher/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/grapher/who/2024-04-08/polio.py @@ -11,10 +11,10 @@ def run(dest_dir: str) -> None: # Load inputs. # # Load garden dataset. - ds_garden = paths.load_dataset("polio_afp") + ds_garden = paths.load_dataset("polio") # Read table from garden dataset. - tb = ds_garden["polio_afp"] + tb = ds_garden["polio"] # # Process data. 
From 473096801382a32d5d77c8681c9540084d473d90 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 12:56:33 +0100 Subject: [PATCH 13/35] adding certification snapshot --- dag/health.yml | 19 +++--- etl/steps/data/garden/who/2024-04-08/polio.py | 1 + .../health/2024-04-12/polio_status.csv.dvc | 26 ++++++++ snapshots/health/2024-04-12/polio_status.py | 59 +++++++++++++++++++ 4 files changed, 97 insertions(+), 8 deletions(-) create mode 100644 snapshots/health/2024-04-12/polio_status.csv.dvc create mode 100644 snapshots/health/2024-04-12/polio_status.py diff --git a/dag/health.yml b/dag/health.yml index c4b308b84b1..a00e9f29b94 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -516,17 +516,20 @@ steps: # Polio AFP surveillance data://meadow/who/2024-04-08/polio_afp: - snapshot://who/2024-04-08/polio_afp.csv - data://garden/who/2024-04-08/polio_afp: - - data://meadow/who/2024-04-08/polio_afp - - data://meadow/who/2024-04-09/polio_historical - - data://garden/wb/2023-04-30/income_groups - - data://garden/regions/2023-01-01/regions - - snapshot://fasttrack/latest/gpei.csv - data://grapher/who/2024-04-08/polio_afp: - - data://garden/who/2024-04-08/polio_afp # Polio historical data data://meadow/who/2024-04-09/polio_historical: - snapshot://who/2024-04-09/polio_historical.xls data://garden/who/2024-04-09/polio_historical: - data://meadow/who/2024-04-09/polio_historical + + # Combinging polio datasets + data://garden/who/2024-04-08/polio: + - data://meadow/who/2024-04-08/polio_afp + - data://meadow/who/2024-04-09/polio_historical + - data://garden/wb/2023-04-30/income_groups + - data://garden/regions/2023-01-01/regions + - snapshot://fasttrack/latest/gpei.csv + - snapshot://health/2024-04-12/polio_status.csv + data://grapher/who/2024-04-08/polio: + - data://garden/who/2024-04-08/polio diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index bb2978dc637..47c364b9b86 100644 --- 
a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -32,6 +32,7 @@ def run(dest_dir: str) -> None: tb_cvdpv = tb_cvdpv.drop(columns=["total_cvdpv"]) # Load regions dataset. ds_regions = paths.load_dataset("regions") + # Load income groups dataset. ds_income_groups = paths.load_dataset("income_groups") diff --git a/snapshots/health/2024-04-12/polio_status.csv.dvc b/snapshots/health/2024-04-12/polio_status.csv.dvc new file mode 100644 index 00000000000..141c9fa8e4f --- /dev/null +++ b/snapshots/health/2024-04-12/polio_status.csv.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Polio Eradication Initiative - Certification Status + date_published: "2021" + # Citation + producer: Global Polio Eradication Initiative + citation_full: |- + Global Polio Eradication Initiative (2024) + attribution: Global Polio Eradication Initiative (2024) + attribution_short: GPEI + + # Files + url_main: https://polioeradication.org/polio-today/preparing-for-a-polio-free-world/certification/ + date_accessed: 2024-04-12 + + # License + license: + name: https://polioeradication.org/terms-of-use/ + url: https://polioeradication.org/terms-of-use/ +outs: + - md5: e9052b2095a1c01afe2f954eda183344 + size: 140 + path: polio_status.csv diff --git a/snapshots/health/2024-04-12/polio_status.py b/snapshots/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..9608a7c8ec6 --- /dev/null +++ b/snapshots/health/2024-04-12/polio_status.py @@ -0,0 +1,59 @@ +"""Script to create a snapshot of dataset. + +Data are transcribed from this webpage: + +https://polioeradication.org/polio-today/preparing-for-a-polio-free-world/certification/ + +""" + +from pathlib import Path + +import click +import pandas as pd +from owid.datautils.io import df_to_file + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"health/{SNAPSHOT_VERSION}/polio_status.csv") + + df = pd.DataFrame( + data={ + "who_region": [ + "Africa", + "Americas", + "South-East Asia", + "Europe", + "Eastern Mediterranean", + "Western Pacific", + ], + "year_certified_polio_free": [ + 2020, + 1994, + 2014, + 2002, + pd.NA, + 2000, + ], + } + ) + df_to_file(df, file_path=snap.path) + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() From ca911851dd9256c849e027e09867d9b8f00aacca Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 13:04:57 +0100 Subject: [PATCH 14/35] adding meadow step for certification --- dag/health.yml | 4 +++ .../meadow/health/2024-04-12/polio_status.py | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 etl/steps/data/meadow/health/2024-04-12/polio_status.py diff --git a/dag/health.yml b/dag/health.yml index a00e9f29b94..bee31a6e5f1 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -533,3 +533,7 @@ steps: - snapshot://health/2024-04-12/polio_status.csv data://grapher/who/2024-04-08/polio: - data://garden/who/2024-04-08/polio + + # Polio certification status + data://meadow/health/2024-04-12/polio_status: + - snapshot://health/2024-04-12/polio_status.csv diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_status.py b/etl/steps/data/meadow/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..b44471e4445 --- /dev/null +++ b/etl/steps/data/meadow/health/2024-04-12/polio_status.py @@ -0,0 +1,27 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("polio_status.csv") + tb = snap.read() + + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(keys=["who_region", "year_certified_polio_free"]) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() From fbb03b841819557e4a50e424ca2b5e555dfd545d Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 13:18:46 +0100 Subject: [PATCH 15/35] adding polio free countries from gpei --- .../2024-04-12/polio_free_countries.csv.dvc | 26 +++++++++++++++ .../health/2024-04-12/polio_free_countries.py | 32 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 snapshots/health/2024-04-12/polio_free_countries.csv.dvc create mode 100644 snapshots/health/2024-04-12/polio_free_countries.py diff --git a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc new file mode 100644 index 00000000000..6b8ea00bbcf --- /dev/null +++ b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Polio-Free Countries + date_published: "2016" + + # Citation + producer: Global Polio Eradication Initiative + citation_full: |- + Global Polio Eradication Initiative (2016) + + # Files + url_main: https://polioeradication.org/where-we-work/polio-free-countries/ + date_accessed: 2024-04-12 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://polioeradication.org/terms-of-use/ + +outs: + - md5: 3670959ed02bcdb84fc0080734fc1bf7 + size: 5123 + path: polio_free_countries.csv diff --git 
a/snapshots/health/2024-04-12/polio_free_countries.py b/snapshots/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..74d8e6c0646 --- /dev/null +++ b/snapshots/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,32 @@ +"""Script to create a snapshot of dataset. + +The data is from this page: https://polioeradication.org/where-we-work/polio-free-countries/ + +The table was copied into a csv and rearranged so that it only has two columns, country and year. + +Then this was uploaded to snapshot. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"health/{SNAPSHOT_VERSION}/polio_free_countries.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() From 26a585bf311b8a350767fbc0845a6d81dd4958a5 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Fri, 12 Apr 2024 15:58:24 +0100 Subject: [PATCH 16/35] adding polio free countries --- dag/health.yml | 11 +- .../2024-04-12/polio_free_countries.json | 196 ++++++++++++++++++ .../health/2024-04-12/polio_free_countries.py | 27 +++ .../2024-04-12/polio_free_countries.yml | 175 ++++++++++++++++ .../health/2024-04-12/polio_status.json | 196 ++++++++++++++++++ .../garden/health/2024-04-12/polio_status.py | 28 +++ .../garden/health/2024-04-12/polio_status.yml | 175 ++++++++++++++++ .../health/2024-04-12/polio_free_countries.py | 22 ++ 8 files changed, 829 insertions(+), 1 deletion(-) create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.json create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.py create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_status.json create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_status.py create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_status.yml create mode 100644 etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py diff --git a/dag/health.yml b/dag/health.yml index bee31a6e5f1..a769ad6213f 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -523,7 +523,7 @@ steps: data://garden/who/2024-04-09/polio_historical: - data://meadow/who/2024-04-09/polio_historical - # Combinging polio datasets + # Combining polio datasets data://garden/who/2024-04-08/polio: - data://meadow/who/2024-04-08/polio_afp - data://meadow/who/2024-04-09/polio_historical @@ -535,5 +535,14 @@ steps: - data://garden/who/2024-04-08/polio # Polio certification status + data://garden/health/2024-04-12/polio_status: + - data://meadow/health/2024-04-12/polio_status + - 
data://meadow/health/2024-04-12/polio_free_countries data://meadow/health/2024-04-12/polio_status: - snapshot://health/2024-04-12/polio_status.csv + + # Polio free countries + data://garden/health/2024-04-12/polio_free_countries: + - data://meadow/health/2024-04-12/polio_free_countries + data://meadow/health/2024-04-12/polio_free_countries: + - snapshot://health/2024-04-12/polio_free_countries.csv diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.json b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.json new file mode 100644 index 00000000000..2b2b53734b2 --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.json @@ -0,0 +1,196 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", + 
"Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of The Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New 
Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + 
"Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Lao People's Democratic Republic ": "Laos", + "Libyan Arab Jamahiriya": "Libya", + "Republic of North Macedonia": "North Macedonia", + "T\u00fcrkiye ": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..87d5f7295d2 --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,27 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("polio_free_countries") + tb = ds_meadow["polio_free_countries"].reset_index() + + # Combine with polio free countries. + + # Set an index and sort. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml new file mode 100644 index 00000000000..4c74ee6c2e1 --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml @@ -0,0 +1,175 @@ +definitions: + common: + presentation: + attribution_short: GODT + topic_tags: + - Global Health + processing_level: minor + unit: "" + short_unit: "" + display: + numDecimalPlaces: 0 + +dataset: + update_period_days: 365 + +tables: + organ_donation_and_transplantation: + variables: + n_organ_donors: + title: Total number of actual deceased organ donors + description_short: |- + Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. + n_donors_after_brain_death: + title: Number of actual donors after brain death + description_short: Deceased organ donors in whom death has been determined by neurological criteria. + n_donors_after_circulatory_death: + title: Number of actual donors after circulatory death + description_short: Deceased organ donors in whom death has been determined by circulatory criteria. + n_utilized_organ_donors: + title: Total number of utilized deceased organ donors + n_utilized_donors_after_brain_death: + title: Number of utilized donors after brain death + description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. + n_utilized_donors_after_circulatory_death: + title: Number of utilized donors after circulatory death + description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. 
+ n_kidney_transplantation_from_deceased_persons: + title: Number of kidney transplantations from deceased persons + n_kidney_transplantation_from_living_persons: + title: Number of kidney transplantations from living persons + n_kidney_transplantation: + title: Total number of kidney transplantations + n_liver_transplantation_from_deceased_persons: + title: Number of liver transplantations from deceased persons + n_liver_transplantation_from_living_persons: + title: Number of liver transplantations from living persons + n_domino_liver_transplantation: + title: Number of domino liver transplantations + description_short: |- + A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. + n_liver_transplantation: + title: Total number of liver transplantations + n_heart_transplantation: + title: Total number of heart transplantations + n_lung_transplantation_from_deceased_persons: + title: Number of lung transplantations from deceased persons + n_lung_transplantation_from_living_persons: + title: Number of lung transplantations from living persons + n_lung_transplantation: + title: Total number of lung transplantations + n_pancreas_transplantation: + title: Total number of pancreas transplantations + n_kidney_pancreas_transplantation: + title: Total number of kidney-pancreas transplantations + n_small_bowel_transplantation: + title: Total number of small bowel transplantations + # Add indicators per million people. + n_organ_donors_per_million_people: + title: Total number of actual deceased organ donors per million people + description_short: |- + Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. 
+ display: + numDecimalPlaces: 1 + processing_level: major + n_donors_after_brain_death_per_million_people: + title: Number of actual donors after brain death per million people + description_short: Deceased organ donors in whom death has been determined by neurological criteria. + processing_level: major + display: + numDecimalPlaces: 1 + n_donors_after_circulatory_death_per_million_people: + title: Number of actual donors after circulatory death per million people + description_short: Deceased organ donors in whom death has been determined by circulatory criteria. + processing_level: major + display: + numDecimalPlaces: 1 + n_utilized_organ_donors_per_million_people: + title: Total number of utilized deceased organ donors per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_utilized_donors_after_brain_death_per_million_people: + title: Number of utilized donors after brain death per million people + description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. + processing_level: major + display: + numDecimalPlaces: 1 + n_utilized_donors_after_circulatory_death_per_million_people: + title: Number of utilized donors after circulatory death per million people + description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. 
+ processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_transplantation_from_deceased_persons_per_million_people: + title: Number of kidney transplantations from deceased persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_transplantation_from_living_persons_per_million_people: + title: Number of kidney transplantations from living persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_transplantation_per_million_people: + title: Total number of kidney transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_liver_transplantation_from_deceased_persons_per_million_people: + title: Number of liver transplantations from deceased persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_liver_transplantation_from_living_persons_per_million_people: + title: Number of liver transplantations from living persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_domino_liver_transplantation_per_million_people: + title: Number of domino liver transplantations per million people + description_short: |- + A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. 
+ processing_level: major + display: + numDecimalPlaces: 1 + n_liver_transplantation_per_million_people: + title: Total number of liver transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_heart_transplantation_per_million_people: + title: Total number of heart transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_lung_transplantation_from_deceased_persons_per_million_people: + title: Number of lung transplantations from deceased persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_lung_transplantation_from_living_persons_per_million_people: + title: Number of lung transplantations from living persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_lung_transplantation_per_million_people: + title: Total number of lung transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_pancreas_transplantation_per_million_people: + title: Total number of pancreas transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_pancreas_transplantation_per_million_people: + title: Total number of kidney-pancreas transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_small_bowel_transplantation_per_million_people: + title: Total number of small bowel transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/health/2024-04-12/polio_status.json b/etl/steps/data/garden/health/2024-04-12/polio_status.json new file mode 100644 index 00000000000..2b2b53734b2 --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_status.json @@ -0,0 +1,196 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + 
"Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of The Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": 
"Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + 
"Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Lao People's Democratic Republic ": "Laos", + "Libyan Arab Jamahiriya": "Libya", + "Republic of North Macedonia": "North Macedonia", + "T\u00fcrkiye ": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/health/2024-04-12/polio_status.py b/etl/steps/data/garden/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..91e2cfa2cd4 --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_status.py @@ -0,0 +1,28 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. 
+ ds_meadow = paths.load_dataset("polio_status") + ds_polio_free = paths.load_dataset("polio_free_countries") + tb = ds_meadow["polio_status"].reset_index() + + # Combine with polio free countries. + + # Set an index and sort. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/health/2024-04-12/polio_status.yml b/etl/steps/data/garden/health/2024-04-12/polio_status.yml new file mode 100644 index 00000000000..4c74ee6c2e1 --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_status.yml @@ -0,0 +1,175 @@ +definitions: + common: + presentation: + attribution_short: GODT + topic_tags: + - Global Health + processing_level: minor + unit: "" + short_unit: "" + display: + numDecimalPlaces: 0 + +dataset: + update_period_days: 365 + +tables: + organ_donation_and_transplantation: + variables: + n_organ_donors: + title: Total number of actual deceased organ donors + description_short: |- + Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. + n_donors_after_brain_death: + title: Number of actual donors after brain death + description_short: Deceased organ donors in whom death has been determined by neurological criteria. + n_donors_after_circulatory_death: + title: Number of actual donors after circulatory death + description_short: Deceased organ donors in whom death has been determined by circulatory criteria. + n_utilized_organ_donors: + title: Total number of utilized deceased organ donors + n_utilized_donors_after_brain_death: + title: Number of utilized donors after brain death + description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. 
+ n_utilized_donors_after_circulatory_death: + title: Number of utilized donors after circulatory death + description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. + n_kidney_transplantation_from_deceased_persons: + title: Number of kidney transplantations from deceased persons + n_kidney_transplantation_from_living_persons: + title: Number of kidney transplantations from living persons + n_kidney_transplantation: + title: Total number of kidney transplantations + n_liver_transplantation_from_deceased_persons: + title: Number of liver transplantations from deceased persons + n_liver_transplantation_from_living_persons: + title: Number of liver transplantations from living persons + n_domino_liver_transplantation: + title: Number of domino liver transplantations + description_short: |- + A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. + n_liver_transplantation: + title: Total number of liver transplantations + n_heart_transplantation: + title: Total number of heart transplantations + n_lung_transplantation_from_deceased_persons: + title: Number of lung transplantations from deceased persons + n_lung_transplantation_from_living_persons: + title: Number of lung transplantations from living persons + n_lung_transplantation: + title: Total number of lung transplantations + n_pancreas_transplantation: + title: Total number of pancreas transplantations + n_kidney_pancreas_transplantation: + title: Total number of kidney-pancreas transplantations + n_small_bowel_transplantation: + title: Total number of small bowel transplantations + # Add indicators per million people. 
+ n_organ_donors_per_million_people: + title: Total number of actual deceased organ donors per million people + description_short: |- + Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. + display: + numDecimalPlaces: 1 + processing_level: major + n_donors_after_brain_death_per_million_people: + title: Number of actual donors after brain death per million people + description_short: Deceased organ donors in whom death has been determined by neurological criteria. + processing_level: major + display: + numDecimalPlaces: 1 + n_donors_after_circulatory_death_per_million_people: + title: Number of actual donors after circulatory death per million people + description_short: Deceased organ donors in whom death has been determined by circulatory criteria. + processing_level: major + display: + numDecimalPlaces: 1 + n_utilized_organ_donors_per_million_people: + title: Total number of utilized deceased organ donors per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_utilized_donors_after_brain_death_per_million_people: + title: Number of utilized donors after brain death per million people + description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. + processing_level: major + display: + numDecimalPlaces: 1 + n_utilized_donors_after_circulatory_death_per_million_people: + title: Number of utilized donors after circulatory death per million people + description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. 
+ processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_transplantation_from_deceased_persons_per_million_people: + title: Number of kidney transplantations from deceased persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_transplantation_from_living_persons_per_million_people: + title: Number of kidney transplantations from living persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_transplantation_per_million_people: + title: Total number of kidney transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_liver_transplantation_from_deceased_persons_per_million_people: + title: Number of liver transplantations from deceased persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_liver_transplantation_from_living_persons_per_million_people: + title: Number of liver transplantations from living persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_domino_liver_transplantation_per_million_people: + title: Number of domino liver transplantations per million people + description_short: |- + A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. 
+ processing_level: major + display: + numDecimalPlaces: 1 + n_liver_transplantation_per_million_people: + title: Total number of liver transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_heart_transplantation_per_million_people: + title: Total number of heart transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_lung_transplantation_from_deceased_persons_per_million_people: + title: Number of lung transplantations from deceased persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_lung_transplantation_from_living_persons_per_million_people: + title: Number of lung transplantations from living persons per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_lung_transplantation_per_million_people: + title: Total number of lung transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_pancreas_transplantation_per_million_people: + title: Total number of pancreas transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_kidney_pancreas_transplantation_per_million_people: + title: Total number of kidney-pancreas transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 + n_small_bowel_transplantation_per_million_people: + title: Total number of small bowel transplantations per million people + processing_level: major + display: + numDecimalPlaces: 1 diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..87eb6597925 --- /dev/null +++ b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,22 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("polio_free_countries.csv") + tb = snap.read() + tb = tb[["country", "year"]].set_index(["country", "year"], verify_integrity=True) + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() From 631a0617d5e1a21eefd32ea823f032ea15353e5b Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 15 Apr 2024 10:10:40 +0100 Subject: [PATCH 17/35] removing garden step polion status --- dag/health.yml | 4 +- .../health/2024-04-12/polio_free_countries.py | 37 +++- .../health/2024-04-12/polio_status.json | 196 ------------------ .../garden/health/2024-04-12/polio_status.py | 28 --- .../garden/health/2024-04-12/polio_status.yml | 175 ---------------- 5 files changed, 36 insertions(+), 404 deletions(-) delete mode 100644 etl/steps/data/garden/health/2024-04-12/polio_status.json delete mode 100644 etl/steps/data/garden/health/2024-04-12/polio_status.py delete mode 100644 etl/steps/data/garden/health/2024-04-12/polio_status.yml diff --git a/dag/health.yml b/dag/health.yml index a769ad6213f..c2b0d21f7d5 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -535,14 +535,12 @@ steps: - data://garden/who/2024-04-08/polio # Polio certification status - data://garden/health/2024-04-12/polio_status: - - data://meadow/health/2024-04-12/polio_status - - data://meadow/health/2024-04-12/polio_free_countries data://meadow/health/2024-04-12/polio_status: - snapshot://health/2024-04-12/polio_status.csv # Polio free countries data://garden/health/2024-04-12/polio_free_countries: + - data://meadow/health/2024-04-12/polio_status - data://meadow/health/2024-04-12/polio_free_countries data://meadow/health/2024-04-12/polio_free_countries: - snapshot://health/2024-04-12/polio_free_countries.csv diff --git 
a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index 87d5f7295d2..07ea6152a20 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -1,10 +1,16 @@ """Load a meadow dataset and create a garden dataset.""" +from itertools import product + +from owid.catalog import Table + from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +LATEST_YEAR = 2023 + def run(dest_dir: str) -> None: # @@ -12,10 +18,11 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read its main table. ds_meadow = paths.load_dataset("polio_free_countries") + ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") tb = ds_meadow["polio_free_countries"].reset_index() - # Combine with polio free countries. - + # Assign polio free countries. + tb = define_polio_free(tb) # Set an index and sort. tb = tb.format() @@ -25,3 +32,29 @@ def run(dest_dir: str) -> None: # Create a new garden dataset. 
ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) ds_garden.save() + + +def define_polio_free(tb: Table) -> Table: + """Define the polio free countries table.""" + + # Clean the data + tb["year"] = tb["year"].astype(str) + # Drop countries with missing values + tb = tb[tb["year"] != "data not available"] + # Change pre 1985 to 1984 + tb["year"] = tb["year"].replace("pre 1985", "1984") + # Change ongoing to LATEST_YEAR + 1 + tb["year"] = tb["year"].replace("ongoing", LATEST_YEAR + 1) + tb["year"] = tb["year"].astype(int) + + years = list(range(1910, LATEST_YEAR + 1)) + tb_prod = Table(product(tb["country"], years), columns=["country", "year"]) + + tb_prod["status"] = tb_prod.apply( + lambda row: "Endemic" + if row["year"] < tb[tb["country"] == row["country"]]["year"].values[0] + else "Polio-free (not certified)", + axis=1, + ) + + return tb_prod diff --git a/etl/steps/data/garden/health/2024-04-12/polio_status.json b/etl/steps/data/garden/health/2024-04-12/polio_status.json deleted file mode 100644 index 2b2b53734b2..00000000000 --- a/etl/steps/data/garden/health/2024-04-12/polio_status.json +++ /dev/null @@ -1,196 +0,0 @@ -{ - "Afghanistan": "Afghanistan", - "Albania": "Albania", - "Algeria": "Algeria", - "Andorra": "Andorra", - "Angola": "Angola", - "Antigua and Barbuda": "Antigua and Barbuda", - "Argentina": "Argentina", - "Armenia": "Armenia", - "Australia": "Australia", - "Austria": "Austria", - "Azerbaijan": "Azerbaijan", - "Bahamas": "Bahamas", - "Bahrain": "Bahrain", - "Bangladesh": "Bangladesh", - "Barbados": "Barbados", - "Belarus": "Belarus", - "Belgium": "Belgium", - "Belize": "Belize", - "Benin": "Benin", - "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", - "Bosnia and Herzegovina": "Bosnia and Herzegovina", - "Botswana": "Botswana", - "Brazil": "Brazil", - "Brunei Darussalam": "Brunei", - "Bulgaria": "Bulgaria", - "Burkina Faso": "Burkina Faso", - "Burundi": "Burundi", - "Cambodia": "Cambodia", - 
"Cameroon": "Cameroon", - "Canada": "Canada", - "Cape Verde": "Cape Verde", - "Central African Republic": "Central African Republic", - "Chad": "Chad", - "Chile": "Chile", - "China": "China", - "Colombia": "Colombia", - "Comoros": "Comoros", - "Congo": "Congo", - "Cook Islands": "Cook Islands", - "Costa Rica": "Costa Rica", - "Croatia": "Croatia", - "Cuba": "Cuba", - "Cyprus": "Cyprus", - "Czech Republic": "Czechia", - "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", - "Democratic People's Republic of Korea": "North Korea", - "Democratic Republic of The Congo": "Democratic Republic of Congo", - "Denmark": "Denmark", - "Djibouti": "Djibouti", - "Dominica": "Dominica", - "Dominican Republic": "Dominican Republic", - "Ecuador": "Ecuador", - "Egypt": "Egypt", - "El Salvador": "El Salvador", - "Equatorial Guinea": "Equatorial Guinea", - "Eritrea": "Eritrea", - "Estonia": "Estonia", - "Eswatini": "Eswatini", - "Ethiopia": "Ethiopia", - "Fiji": "Fiji", - "Finland": "Finland", - "France": "France", - "Gabon": "Gabon", - "Gambia": "Gambia", - "Georgia": "Georgia", - "Germany": "Germany", - "Ghana": "Ghana", - "Greece": "Greece", - "Grenada": "Grenada", - "Guatemala": "Guatemala", - "Guinea": "Guinea", - "Guinea-Bissau": "Guinea-Bissau", - "Guyana": "Guyana", - "Haiti": "Haiti", - "Honduras": "Honduras", - "Hungary": "Hungary", - "Iceland": "Iceland", - "India": "India", - "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", - "Iraq": "Iraq", - "Ireland": "Ireland", - "Israel": "Israel", - "Italy": "Italy", - "Jamaica": "Jamaica", - "Japan": "Japan", - "Jordan": "Jordan", - "Kazakhstan": "Kazakhstan", - "Kenya": "Kenya", - "Kiribati": "Kiribati", - "Kuwait": "Kuwait", - "Kyrgyzstan": "Kyrgyzstan", - "Latvia": "Latvia", - "Lebanon": "Lebanon", - "Lesotho": "Lesotho", - "Liberia": "Liberia", - "Lithuania": "Lithuania", - "Luxembourg": "Luxembourg", - "Madagascar": "Madagascar", - "Malawi": "Malawi", - "Malaysia": "Malaysia", - "Maldives": "Maldives", - "Mali": "Mali", 
- "Malta": "Malta", - "Marshall Islands": "Marshall Islands", - "Mauritania": "Mauritania", - "Mauritius": "Mauritius", - "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", - "Monaco": "Monaco", - "Mongolia": "Mongolia", - "Montenegro": "Montenegro", - "Morocco": "Morocco", - "Mozambique": "Mozambique", - "Myanmar": "Myanmar", - "Namibia": "Namibia", - "Nauru": "Nauru", - "Nepal": "Nepal", - "Netherlands": "Netherlands", - "New Zealand": "New Zealand", - "Nicaragua": "Nicaragua", - "Niger": "Niger", - "Nigeria": "Nigeria", - "Niue": "Niue", - "Norway": "Norway", - "Oman": "Oman", - "Pakistan": "Pakistan", - "Palau": "Palau", - "Panama": "Panama", - "Papua New Guinea": "Papua New Guinea", - "Paraguay": "Paraguay", - "Peru": "Peru", - "Philippines": "Philippines", - "Poland": "Poland", - "Portugal": "Portugal", - "Qatar": "Qatar", - "Republic of Korea": "South Korea", - "Republic of Moldova": "Moldova", - "Romania": "Romania", - "Russian Federation": "Russia", - "Rwanda": "Rwanda", - "Saint Kitts and Nevis": "Saint Kitts and Nevis", - "Saint Lucia": "Saint Lucia", - "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", - "Samoa": "Samoa", - "San Marino": "San Marino", - "Sao Tome and Principe": "Sao Tome and Principe", - "Saudi Arabia": "Saudi Arabia", - "Senegal": "Senegal", - "Serbia": "Serbia", - "Seychelles": "Seychelles", - "Sierra Leone": "Sierra Leone", - "Singapore": "Singapore", - "Slovakia": "Slovakia", - "Slovenia": "Slovenia", - "Solomon Islands": "Solomon Islands", - "Somalia": "Somalia", - "South Africa": "South Africa", - "South Sudan": "South Sudan", - "Spain": "Spain", - "Sri Lanka": "Sri Lanka", - "Sudan": "Sudan", - "Suriname": "Suriname", - "Sweden": "Sweden", - "Switzerland": "Switzerland", - "Syrian Arab Republic": "Syria", - "Tajikistan": "Tajikistan", - "Thailand": "Thailand", - "Timor-Leste": "East Timor", - "Togo": "Togo", - "Tonga": "Tonga", - "Trinidad and Tobago": "Trinidad and Tobago", - 
"Tunisia": "Tunisia", - "Turkmenistan": "Turkmenistan", - "Tuvalu": "Tuvalu", - "Uganda": "Uganda", - "Ukraine": "Ukraine", - "United Arab Emirates": "United Arab Emirates", - "United Kingdom": "United Kingdom", - "United Republic of Tanzania": "Tanzania", - "United States of America": "United States", - "Uruguay": "Uruguay", - "Uzbekistan": "Uzbekistan", - "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", - "Viet Nam": "Vietnam", - "Yemen": "Yemen", - "Zambia": "Zambia", - "Zimbabwe": "Zimbabwe", - "Lao People's Democratic Republic ": "Laos", - "Libyan Arab Jamahiriya": "Libya", - "Republic of North Macedonia": "North Macedonia", - "T\u00fcrkiye ": "Turkey" -} \ No newline at end of file diff --git a/etl/steps/data/garden/health/2024-04-12/polio_status.py b/etl/steps/data/garden/health/2024-04-12/polio_status.py deleted file mode 100644 index 91e2cfa2cd4..00000000000 --- a/etl/steps/data/garden/health/2024-04-12/polio_status.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Load a meadow dataset and create a garden dataset.""" - -from etl.helpers import PathFinder, create_dataset - -# Get paths and naming conventions for current step. -paths = PathFinder(__file__) - - -def run(dest_dir: str) -> None: - # - # Load inputs. - # - # Load meadow dataset and read its main table. - ds_meadow = paths.load_dataset("polio_status") - ds_polio_free = paths.load_dataset("polio_free_countries") - tb = ds_meadow["polio_status"].reset_index() - - # Combine with polio free countries. - - # Set an index and sort. - tb = tb.format() - - # - # Save outputs. - # - # Create a new garden dataset. 
- ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) - ds_garden.save() diff --git a/etl/steps/data/garden/health/2024-04-12/polio_status.yml b/etl/steps/data/garden/health/2024-04-12/polio_status.yml deleted file mode 100644 index 4c74ee6c2e1..00000000000 --- a/etl/steps/data/garden/health/2024-04-12/polio_status.yml +++ /dev/null @@ -1,175 +0,0 @@ -definitions: - common: - presentation: - attribution_short: GODT - topic_tags: - - Global Health - processing_level: minor - unit: "" - short_unit: "" - display: - numDecimalPlaces: 0 - -dataset: - update_period_days: 365 - -tables: - organ_donation_and_transplantation: - variables: - n_organ_donors: - title: Total number of actual deceased organ donors - description_short: |- - Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. - n_donors_after_brain_death: - title: Number of actual donors after brain death - description_short: Deceased organ donors in whom death has been determined by neurological criteria. - n_donors_after_circulatory_death: - title: Number of actual donors after circulatory death - description_short: Deceased organ donors in whom death has been determined by circulatory criteria. - n_utilized_organ_donors: - title: Total number of utilized deceased organ donors - n_utilized_donors_after_brain_death: - title: Number of utilized donors after brain death - description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. - n_utilized_donors_after_circulatory_death: - title: Number of utilized donors after circulatory death - description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. 
- n_kidney_transplantation_from_deceased_persons: - title: Number of kidney transplantations from deceased persons - n_kidney_transplantation_from_living_persons: - title: Number of kidney transplantations from living persons - n_kidney_transplantation: - title: Total number of kidney transplantations - n_liver_transplantation_from_deceased_persons: - title: Number of liver transplantations from deceased persons - n_liver_transplantation_from_living_persons: - title: Number of liver transplantations from living persons - n_domino_liver_transplantation: - title: Number of domino liver transplantations - description_short: |- - A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. - n_liver_transplantation: - title: Total number of liver transplantations - n_heart_transplantation: - title: Total number of heart transplantations - n_lung_transplantation_from_deceased_persons: - title: Number of lung transplantations from deceased persons - n_lung_transplantation_from_living_persons: - title: Number of lung transplantations from living persons - n_lung_transplantation: - title: Total number of lung transplantations - n_pancreas_transplantation: - title: Total number of pancreas transplantations - n_kidney_pancreas_transplantation: - title: Total number of kidney-pancreas transplantations - n_small_bowel_transplantation: - title: Total number of small bowel transplantations - # Add indicators per million people. - n_organ_donors_per_million_people: - title: Total number of actual deceased organ donors per million people - description_short: |- - Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. 
- display: - numDecimalPlaces: 1 - processing_level: major - n_donors_after_brain_death_per_million_people: - title: Number of actual donors after brain death per million people - description_short: Deceased organ donors in whom death has been determined by neurological criteria. - processing_level: major - display: - numDecimalPlaces: 1 - n_donors_after_circulatory_death_per_million_people: - title: Number of actual donors after circulatory death per million people - description_short: Deceased organ donors in whom death has been determined by circulatory criteria. - processing_level: major - display: - numDecimalPlaces: 1 - n_utilized_organ_donors_per_million_people: - title: Total number of utilized deceased organ donors per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_utilized_donors_after_brain_death_per_million_people: - title: Number of utilized donors after brain death per million people - description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. - processing_level: major - display: - numDecimalPlaces: 1 - n_utilized_donors_after_circulatory_death_per_million_people: - title: Number of utilized donors after circulatory death per million people - description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. 
- processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_transplantation_from_deceased_persons_per_million_people: - title: Number of kidney transplantations from deceased persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_transplantation_from_living_persons_per_million_people: - title: Number of kidney transplantations from living persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_transplantation_per_million_people: - title: Total number of kidney transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_liver_transplantation_from_deceased_persons_per_million_people: - title: Number of liver transplantations from deceased persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_liver_transplantation_from_living_persons_per_million_people: - title: Number of liver transplantations from living persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_domino_liver_transplantation_per_million_people: - title: Number of domino liver transplantations per million people - description_short: |- - A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. 
- processing_level: major - display: - numDecimalPlaces: 1 - n_liver_transplantation_per_million_people: - title: Total number of liver transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_heart_transplantation_per_million_people: - title: Total number of heart transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_lung_transplantation_from_deceased_persons_per_million_people: - title: Number of lung transplantations from deceased persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_lung_transplantation_from_living_persons_per_million_people: - title: Number of lung transplantations from living persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_lung_transplantation_per_million_people: - title: Total number of lung transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_pancreas_transplantation_per_million_people: - title: Total number of pancreas transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_pancreas_transplantation_per_million_people: - title: Total number of kidney-pancreas transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_small_bowel_transplantation_per_million_people: - title: Total number of small bowel transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 From 55ed6200575e6f84c9be66f6894fb364028ff02a Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 09:32:10 +0100 Subject: [PATCH 18/35] adding population dependencies --- dag/health.yml | 1 + etl/steps/data/garden/who/2024-04-08/polio.py | 22 +++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/dag/health.yml b/dag/health.yml index c2b0d21f7d5..b2d93aaaf55 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -529,6 
+529,7 @@ steps: - data://meadow/who/2024-04-09/polio_historical - data://garden/wb/2023-04-30/income_groups - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population - snapshot://fasttrack/latest/gpei.csv - snapshot://health/2024-04-12/polio_status.csv data://grapher/who/2024-04-08/polio: diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index 47c364b9b86..961bf997244 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -25,6 +25,9 @@ def run(dest_dir: str) -> None: ds_meadow = paths.load_dataset("polio_afp") # Load historical polio dataset ds_historical = paths.load_dataset("polio_historical") + # Load population data to calculate cases per million population + ds_population = paths.load_dataset("population") + tb_population = ds_population["population"] # Load fasttrack Global Polio Eradication Initiative on circulating vaccine derived polio cases snap_cvdpv = paths.load_snapshot("gpei.csv") tb_cvdpv = snap_cvdpv.read() @@ -101,6 +104,17 @@ def run(dest_dir: str) -> None: ds_garden.save() +def cases_per_million(tb: Table, ds_population: Table) -> Table: + """ + Add cases per million population for each country. + """ + tb = tb.reset_index() + tb = tb.merge(ds_population, on=["country", "year"], how="left") + tb["cases_per_million"] = tb["total_cases"] / tb["population"] * 1_000_000 + tb = tb.set_index(["country", "year"], verify_integrity=True) + return tb + + def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: """ Adds the polio surveillance status based on the screening and testing rates. @@ -174,10 +188,10 @@ def add_correction_factor(tb: Table) -> Table: # tb.loc[tb["year"] == 2021, "correction_factor"] = np.nan # Namibia had 'percent_adequate_stool_collection' > 100 in 2011 and 2014 but for other years it's correction factor is 1.11 so we set it as 1.11 for 2011 and 2014. 
- tb["correction_factor"][(tb["country"] == "Namibia") & (tb["year"].isin([2011, 2014]))] = 1.11 + tb.loc[(tb["country"] == "Namibia") & (tb["year"].isin([2011, 2014])), "correction_factor"] = 1.11 # For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988. - tb["correction_factor"][(tb["country"] == "China") & (tb["year"].isin([1989, 1990, 1991, 1992]))] = 1.11 - tb["correction_factor"][(tb["country"] == "Oman") & (tb["year"].isin([1988]))] = 1.11 + tb.loc[(tb["country"] == "China") & (tb["year"].isin([1989, 1990, 1991, 1992])), "correction_factor"] = 1.11 + tb.loc[(tb["country"] == "Oman") & (tb["year"].isin([1988])), "correction_factor"] = 1.11 # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise tb["correction_factor"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins return tb @@ -187,7 +201,7 @@ def clean_adequate_stool_collection(tb: Table) -> Table: """ Some values for "Adequate stool collection" are over 100%, we should set these to NA. 
""" - tb["pct_adequate_stool_collection"][tb["pct_adequate_stool_collection"] > 100] = pd.NA + tb.loc[tb["pct_adequate_stool_collection"] > 100, "pct_adequate_stool_collection"] = pd.NA return tb From 985e043593f8bec400d8771d768199416f9443e5 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 10:46:36 +0100 Subject: [PATCH 19/35] adding per million and harmonizing historical data too --- .../who/2024-04-08/polio.countries.json | 37 ++++++++++++++ .../data/garden/who/2024-04-08/polio.meta.yml | 49 +++++++++++++++++++ etl/steps/data/garden/who/2024-04-08/polio.py | 41 ++++++++++------ 3 files changed, 111 insertions(+), 16 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.countries.json b/etl/steps/data/garden/who/2024-04-08/polio.countries.json index c8fc7c38076..1a59d51c260 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.countries.json +++ b/etl/steps/data/garden/who/2024-04-08/polio.countries.json @@ -4,25 +4,32 @@ "Algeria": "Algeria", "Andorra": "Andorra", "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", "Argentina": "Argentina", "Armenia": "Armenia", "Australia": "Australia", "Austria": "Austria", "Azerbaijan": "Azerbaijan", "Bahrain": "Bahrain", + "Bahamas": "Bahamas", "Bangladesh": "Bangladesh", + "Barbados": "Barbados", "Belarus": "Belarus", "Belgium": "Belgium", + "Belize": "Belize", "Benin": "Benin", "Bhutan": "Bhutan", "Bolivia": "Bolivia", + "Bolivia (Plurinational State of)": "Bolivia", "Bosnia and Herzegovina": "Bosnia and Herzegovina", "Botswana": "Botswana", "Brazil": "Brazil", "Brunei": "Brunei", + "Brunei Darussalam": "Brunei", "Bulgaria": "Bulgaria", "Burkina Faso": "Burkina Faso", "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", "Cambodia": "Cambodia", "Cameroon": "Cameroon", "Canada": "Canada", @@ -36,16 +43,19 @@ "Colombia": "Colombia", "Comoros": "Comoros", "Congo": "Congo", + "Cook Islands": "Cook Islands", "Costa Rica": "Costa Rica", "Croatia": "Croatia", "Cuba": "Cuba", "Cyprus": 
"Cyprus", "Czech Republic": "Czechia", + "Czechia": "Czechia", "C\u00f4te d'Ivoire": "Cote d'Ivoire", "Democratic People's Republic of Korea": "North Korea", "Democratic Republic of the Congo": "Democratic Republic of Congo", "Denmark": "Denmark", "Djibouti": "Djibouti", + "Dominica": "Dominica", "Dominican Republic": "Dominican Republic", "Ecuador": "Ecuador", "Egypt": "Egypt", @@ -53,7 +63,9 @@ "Equatorial Guinea": "Equatorial Guinea", "Eritrea": "Eritrea", "Estonia": "Estonia", + "Eswatini": "Eswatini", "Ethiopia": "Ethiopia", + "Fiji": "Fiji", "Finland": "Finland", "France": "France", "Gabon": "Gabon", @@ -62,9 +74,11 @@ "Germany": "Germany", "Ghana": "Ghana", "Greece": "Greece", + "Grenada": "Grenada", "Guatemala": "Guatemala", "Guinea": "Guinea", "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", "Haiti": "Haiti", "Honduras": "Honduras", "Hungary": "Hungary", @@ -76,10 +90,12 @@ "Ireland": "Ireland", "Israel": "Israel", "Italy": "Italy", + "Jamaica": "Jamaica", "Japan": "Japan", "Jordan": "Jordan", "Kazakhstan": "Kazakhstan", "Kenya": "Kenya", + "Kiribati": "Kiribati", "Kuwait": "Kuwait", "Kyrgyzstan": "Kyrgyzstan", "Lao People's Democratic Republic": "Laos", @@ -87,6 +103,7 @@ "Lebanon": "Lebanon", "Lesotho": "Lesotho", "Liberia": "Liberia", + "Libya": "Libya", "Lithuania": "Lithuania", "Luxembourg": "Luxembourg", "Madagascar": "Madagascar", @@ -95,9 +112,11 @@ "Maldives": "Maldives", "Mali": "Mali", "Malta": "Malta", + "Marshall Islands": "Marshall Islands", "Mauritania": "Mauritania", "Mauritius": "Mauritius", "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", "Moldova": "Moldova", "Monaco": "Monaco", "Mongolia": "Mongolia", @@ -106,15 +125,19 @@ "Mozambique": "Mozambique", "Myanmar": "Myanmar", "Namibia": "Namibia", + "Nauru": "Nauru", "Nepal": "Nepal", "Netherlands": "Netherlands", "New Zealand": "New Zealand", "Nicaragua": "Nicaragua", "Niger": "Niger", "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": 
"North Macedonia", "Norway": "Norway", "Oman": "Oman", "Pakistan": "Pakistan", + "Palau": "Palau", "Panama": "Panama", "Papua New Guinea": "Papua New Guinea", "Paraguay": "Paraguay", @@ -124,11 +147,16 @@ "Portugal": "Portugal", "Qatar": "Qatar", "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", "Reunion": "Reunion", "Romania": "Romania", "Russian Federation": "Russia", "Rwanda": "Rwanda", "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", "San Marino": "San Marino", "Sao Tome and Principe": "Sao Tome and Principe", "Saudi Arabia": "Saudi Arabia", @@ -140,31 +168,40 @@ "Singapore": "Singapore", "Slovakia": "Slovakia", "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", "Somalia": "Somalia", "South Africa": "South Africa", "South Sudan": "South Sudan", "Spain": "Spain", "Sri Lanka": "Sri Lanka", "Sudan": "Sudan", + "Suriname": "Suriname", "Swaziland": "Eswatini", "Sweden": "Sweden", "Switzerland": "Switzerland", "Syrian Arab Republic": "Syria", "Tajikistan": "Tajikistan", "Thailand": "Thailand", + "Timor-Leste": "East Timor", "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", "Tunisia": "Tunisia", "Turkey": "Turkey", "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", "Uganda": "Uganda", "Ukraine": "Ukraine", "United Arab Emirates": "United Arab Emirates", "United Kingdom": "United Kingdom", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", "United Republic of Tanzania": "Tanzania", "United States of America": "United States", "Uruguay": "Uruguay", "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", "Venezuela": "Venezuela", + "Venezuela (Bolivarian Republic of)": "Venezuela", "Viet Nam": "Vietnam", "West Bank and Gaza": "Palestine", "Yemen": "Yemen", diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml 
b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml index 01a8833fe56..7725c8e1601 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -9,6 +9,7 @@ definitions: # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: + title: "World Health Organization (WHO) - Polio" update_period_days: 365 tables: @@ -20,6 +21,12 @@ tables: unit: cases display: numDecimalPlaces: 0 + afp_cases_per_million: + title: Acute Flaccid Paralysis (AFP) cases per million + description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 non_polio_afp_rate: title: Non-Polio acute flaccid paralysis (afp) rate description_short: The number of cases of acute flaccid paralysis (AFP) per 100,000 children aged 15 or under, not attributed to polio, that were detected and reported to the WHO. 
@@ -40,12 +47,24 @@ tables: unit: cases display: numDecimalPlaces: 0 + wild_poliovirus_cases_per_million: + title: Wild Poliovirus cases per million + description_short: "The number of cases of wild poliovirus detected in a given year per million people" + unit: cases per million + display: + numDecimalPlaces: 1 cvdpv_cases: title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases description_short: "The number of cases of circulating vaccine-derived poliovirus detected in a given year" unit: cases display: numDecimalPlaces: 0 + cvdpv_cases_per_million: + title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases per million + description_short: "The number of cases of circulating vaccine-derived poliovirus detected in a given year per million people" + unit: cases per million + display: + numDecimalPlaces: 1 compatibles: title: Compatibles unit: "" @@ -67,27 +86,57 @@ tables: unit: cases display: numDecimalPlaces: 0 + total_cases_per_million: + title: Total polio cases per million + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases per million people + unit: cases per million + display: + numDecimalPlaces: 1 estimated_cases: title: Estimated polio cases description_short: The total estimated number of polio cases, using Tebbens et al. unit: cases display: numDecimalPlaces: 0 + estimated_cases_per_million: + title: Estimated polio cases per million + description_short: The total estimated number of polio cases per million people, using Tebbens et al. 
+ unit: cases per million + display: + numDecimalPlaces: 1 cvdpv1: title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases description_short: "The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year" unit: cases display: numDecimalPlaces: 0 + cvdpv1_per_million: + title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV2) cases per million + description_short: "The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year, per million people" + unit: cases per million + display: + numDecimalPlaces: 1 cvdpv2: title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases description_short: "The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year" unit: cases display: numDecimalPlaces: 0 + cvdpv2_per_million: + title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases per million + description_short: "The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year, per million people" + unit: cases per million + display: + numDecimalPlaces: 1 cvdpv3: title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases description_short: "The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year" unit: cases display: numDecimalPlaces: 0 + cvdpv3_per_million: + title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases per million + description_short: "The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year, per million people" + unit: cases per million + display: + numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index 961bf997244..f8c9f928127 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -4,7 +4,7 @@ from owid.catalog import Table from owid.catalog import processing as pr -from 
etl.data_helpers.geo import add_regions_to_table, harmonize_countries +from etl.data_helpers.geo import add_population_to_table, add_regions_to_table, harmonize_countries from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -27,7 +27,7 @@ def run(dest_dir: str) -> None: ds_historical = paths.load_dataset("polio_historical") # Load population data to calculate cases per million population ds_population = paths.load_dataset("population") - tb_population = ds_population["population"] + tb_population = ds_population["population"].reset_index() # Load fasttrack Global Polio Eradication Initiative on circulating vaccine derived polio cases snap_cvdpv = paths.load_snapshot("gpei.csv") tb_cvdpv = snap_cvdpv.read() @@ -35,7 +35,6 @@ def run(dest_dir: str) -> None: tb_cvdpv = tb_cvdpv.drop(columns=["total_cvdpv"]) # Load regions dataset. ds_regions = paths.load_dataset("regions") - # Load income groups dataset. ds_income_groups = paths.load_dataset("income_groups") @@ -45,12 +44,6 @@ def run(dest_dir: str) -> None: tb_hist = tb_hist.rename(columns={"cases": "total_cases"}) # Only need this for data prior to 2001 tb_hist = tb_hist[tb_hist["year"] < 2001] - # - # Process data. - # - tb = harmonize_countries( - df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path - ) # Remove data from before 2001. tb = remove_pre_2001_data(tb) @@ -60,6 +53,9 @@ def run(dest_dir: str) -> None: tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] # Need to deal with overlapping years tb = pr.concat([tb, tb_hist], axis=0) + tb = harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) tb = tb.merge(tb_cvdpv, on=["country", "year"], how="left") # Add region aggregates. 
tb_reg = add_regions_to_table( @@ -88,7 +84,7 @@ def run(dest_dir: str) -> None: tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] # Add polio surveillance status based on the screening and testing rates. tb = add_screening_and_testing(tb) - + tb = add_cases_per_million(tb, tb_population) tb = tb.set_index(["country", "year"], verify_integrity=True) tb.metadata.short_name = "polio" @@ -104,14 +100,27 @@ def run(dest_dir: str) -> None: ds_garden.save() -def cases_per_million(tb: Table, ds_population: Table) -> Table: +def add_cases_per_million(tb: Table, tb_population: Table) -> Table: """ - Add cases per million population for each country. + Add cases per million population for each country, for the columns concerning each type of polio cases. """ - tb = tb.reset_index() - tb = tb.merge(ds_population, on=["country", "year"], how="left") - tb["cases_per_million"] = tb["total_cases"] / tb["population"] * 1_000_000 - tb = tb.set_index(["country", "year"], verify_integrity=True) + tb_population = tb_population[["country", "year", "population"]] + tb = tb.merge(tb_population, on=["country", "year"], how="left") + + cols_to_divide = [ + "afp_cases", + "wild_poliovirus_cases", + "cvdpv_cases", + "total_cases", + "estimated_cases", + "cvdpv1", + "cvdpv2", + "cvdpv3", + ] + for col in cols_to_divide: + tb[f"{col}_per_million"] = tb[col] / tb["population"] * 1_000_000 + + tb = tb.drop(columns=["population"]) return tb From 11ae1c637f9350b717d4643c971c8d0833fdb462 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 11:32:09 +0100 Subject: [PATCH 20/35] adding year of last wild polio case --- .../health/2024-04-12/polio_free_countries.py | 34 ++++++++++++------- .../data/garden/who/2024-04-08/polio.meta.yml | 6 ++-- etl/steps/data/garden/who/2024-04-08/polio.py | 4 +-- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py 
b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index 07ea6152a20..f67c3900569 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: tb = ds_meadow["polio_free_countries"].reset_index() # Assign polio free countries. - tb = define_polio_free(tb) + tb = define_polio_free_new(tb, latest_year=LATEST_YEAR) # Set an index and sort. tb = tb.format() @@ -34,27 +34,35 @@ def run(dest_dir: str) -> None: ds_garden.save() -def define_polio_free(tb: Table) -> Table: +def define_polio_free_new(tb: Table, latest_year: int) -> Table: """Define the polio free countries table.""" + # Make a copy of the DataFrame to avoid modifying the original DataFrame + tb = tb.copy() # Clean the data tb["year"] = tb["year"].astype(str) - # Drop countries with missing values - tb = tb[tb["year"] != "data not available"] - # Change pre 1985 to 1984 - tb["year"] = tb["year"].replace("pre 1985", "1984") - # Change ongoing to LATEST_YEAR + 1 - tb["year"] = tb["year"].replace("ongoing", LATEST_YEAR + 1) - tb["year"] = tb["year"].astype(int) - years = list(range(1910, LATEST_YEAR + 1)) - tb_prod = Table(product(tb["country"], years), columns=["country", "year"]) + # Drop countries with missing values explicitly copying to avoid setting on a slice warning + tb = tb[tb["year"] != "data not available"].copy() + + # Change 'pre 1985' to 1984 and 'ongoing' to LATEST_YEAR + 1 + tb.loc[tb["year"] == "pre 1985", "year"] = "1984" + tb.loc[tb["year"] == "ongoing", "year"] = str(latest_year + 1) + tb["year"] = tb["year"].astype(int) + # Rename year to latest year + tb = tb.rename(columns={"year": "latest_year_wild_polio_case"}) + # Create a product of all countries and all years from 1910 to LATEST_YEAR + tb_prod = Table(product(tb["country"].unique(), range(1910, latest_year + 1)), columns=["country", "year"]) + + # Define polio status based on the year 
comparison tb_prod["status"] = tb_prod.apply( lambda row: "Endemic" - if row["year"] < tb[tb["country"] == row["country"]]["year"].values[0] + if row["year"] < tb[tb["country"] == row["country"]]["latest_year_wild_polio_case"].min() else "Polio-free (not certified)", axis=1, ) - return tb_prod + tb = tb.merge(tb_prod, on="country") + + return tb diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml index 7725c8e1601..56a40630476 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -94,13 +94,15 @@ tables: numDecimalPlaces: 1 estimated_cases: title: Estimated polio cases - description_short: The total estimated number of polio cases, using Tebbens et al. + description_short: The total estimated number of polio cases. + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al (2010). unit: cases display: numDecimalPlaces: 0 estimated_cases_per_million: title: Estimated polio cases per million - description_short: The total estimated number of polio cases per million people, using Tebbens et al. + description_short: The total estimated number of polio cases per million people. + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al (2010). 
unit: cases per million display: numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index f8c9f928127..e9a63788dff 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -4,7 +4,7 @@ from owid.catalog import Table from owid.catalog import processing as pr -from etl.data_helpers.geo import add_population_to_table, add_regions_to_table, harmonize_countries +from etl.data_helpers.geo import add_regions_to_table, harmonize_countries from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -52,7 +52,7 @@ def run(dest_dir: str) -> None: # Add total cases tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] # Need to deal with overlapping years - tb = pr.concat([tb, tb_hist], axis=0) + tb = pr.concat([tb_hist, tb], axis=0) tb = harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) From 104728ac76066e009e6965132e4bbb0aa07b52c2 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 12:42:37 +0100 Subject: [PATCH 21/35] wip --- ...on => polio_free_countries.countries.json} | 54 ++++-- .../health/2024-04-12/polio_free_countries.py | 29 +++- .../2024-04-12/polio_free_countries.yml | 161 +----------------- 3 files changed, 63 insertions(+), 181 deletions(-) rename etl/steps/data/garden/health/2024-04-12/{polio_free_countries.json => polio_free_countries.countries.json} (77%) diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.json b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json similarity index 77% rename from etl/steps/data/garden/health/2024-04-12/polio_free_countries.json rename to etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json index 2b2b53734b2..b808d4198a8 100644 --- 
a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.json +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json @@ -13,13 +13,11 @@ "Bahamas": "Bahamas", "Bahrain": "Bahrain", "Bangladesh": "Bangladesh", - "Barbados": "Barbados", "Belarus": "Belarus", "Belgium": "Belgium", "Belize": "Belize", "Benin": "Benin", "Bhutan": "Bhutan", - "Bolivia (Plurinational State of)": "Bolivia", "Bosnia and Herzegovina": "Bosnia and Herzegovina", "Botswana": "Botswana", "Brazil": "Brazil", @@ -44,9 +42,6 @@ "Cuba": "Cuba", "Cyprus": "Cyprus", "Czech Republic": "Czechia", - "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", - "Democratic People's Republic of Korea": "North Korea", - "Democratic Republic of The Congo": "Democratic Republic of Congo", "Denmark": "Denmark", "Djibouti": "Djibouti", "Dominica": "Dominica", @@ -57,7 +52,6 @@ "Equatorial Guinea": "Equatorial Guinea", "Eritrea": "Eritrea", "Estonia": "Estonia", - "Eswatini": "Eswatini", "Ethiopia": "Ethiopia", "Fiji": "Fiji", "Finland": "Finland", @@ -72,6 +66,7 @@ "Guatemala": "Guatemala", "Guinea": "Guinea", "Guinea-Bissau": "Guinea-Bissau", + "Guam": "Guam", "Guyana": "Guyana", "Haiti": "Haiti", "Honduras": "Honduras", @@ -79,7 +74,6 @@ "Iceland": "Iceland", "India": "India", "Indonesia": "Indonesia", - "Iran (Islamic Republic of)": "Iran", "Iraq": "Iraq", "Ireland": "Ireland", "Israel": "Israel", @@ -108,10 +102,8 @@ "Mauritania": "Mauritania", "Mauritius": "Mauritius", "Mexico": "Mexico", - "Micronesia (Federated States of)": "Micronesia (country)", "Monaco": "Monaco", "Mongolia": "Mongolia", - "Montenegro": "Montenegro", "Morocco": "Morocco", "Mozambique": "Mozambique", "Myanmar": "Myanmar", @@ -149,7 +141,6 @@ "Sao Tome and Principe": "Sao Tome and Principe", "Saudi Arabia": "Saudi Arabia", "Senegal": "Senegal", - "Serbia": "Serbia", "Seychelles": "Seychelles", "Sierra Leone": "Sierra Leone", "Singapore": "Singapore", @@ -158,7 +149,6 @@ "Solomon Islands": "Solomon Islands", "Somalia": 
"Somalia", "South Africa": "South Africa", - "South Sudan": "South Sudan", "Spain": "Spain", "Sri Lanka": "Sri Lanka", "Sudan": "Sudan", @@ -168,7 +158,6 @@ "Syrian Arab Republic": "Syria", "Tajikistan": "Tajikistan", "Thailand": "Thailand", - "Timor-Leste": "East Timor", "Togo": "Togo", "Tonga": "Tonga", "Trinidad and Tobago": "Trinidad and Tobago", @@ -184,13 +173,46 @@ "Uruguay": "Uruguay", "Uzbekistan": "Uzbekistan", "Vanuatu": "Vanuatu", - "Venezuela (Bolivarian Republic of)": "Venezuela", "Viet Nam": "Vietnam", "Yemen": "Yemen", "Zambia": "Zambia", "Zimbabwe": "Zimbabwe", - "Lao People's Democratic Republic ": "Laos", "Libyan Arab Jamahiriya": "Libya", - "Republic of North Macedonia": "North Macedonia", - "T\u00fcrkiye ": "Turkey" + "American Samoa": "American Samoa", + "Anguilla": "Anguilla", + "Bermuda": "Bermuda", + "Bolivia": "Bolivia", + "British Virgin Islands": "British Virgin Islands", + "Cayman Islands": "Cayman Islands", + "East Timor": "East Timor", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Guadeloupe": "Guadeloupe", + "Iran": "Iran", + "Martinique": "Martinique", + "Montserrat": "Montserrat", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "Puerto Rico": "Puerto Rico", + "Reunion": "Reunion", + "Saint Helena": "Saint Helena", + "Swaziland": "Eswatini", + "Tokelau": "Tokelau", + "Turkey": "Turkey", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Venezuela": "Venezuela", + "Wallis and Futuna": "Wallis and Futuna", + "West Bank and Gaza": "Palestine", + "Barabados": "Barbados", + "Cote d\u2019Ivoire": "Cote d'Ivoire", + "Democratic People\u2019s Rep. of Korea": "North Korea", + "Federated States of Micronesia": "Micronesia (country)", + "Former Yugoslav Rep. 
of Macedonia": "North Macedonia", + "Hong Kong, SAR": "Hong Kong", + "Lao People\u2019s Democratic Republic": "Laos", + "Macao, SAR": "Macao", + "Mariana Islands": "Northern Mariana Islands", + "Palestine N.A.": "Palestine", + "US Virgin Islands": "United States Virgin Islands", + "Democratic Republic of the Congo": "Democratic Republic of Congo" } \ No newline at end of file diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index f67c3900569..d81182a31e2 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -4,6 +4,7 @@ from owid.catalog import Table +from etl.data_helpers.geo import harmonize_countries from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -18,19 +19,36 @@ def run(dest_dir: str) -> None: # # Load meadow dataset and read its main table. ds_meadow = paths.load_dataset("polio_free_countries") - ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") tb = ds_meadow["polio_free_countries"].reset_index() + ##### Temporary fix - we remove West Bank and Gaza as there is both data for West Bank and Gaza _and_ Palestine N.A (national authority). + ##### I'm not sure how we should treat these but for now I will just stick with the entity that has the latest value, so Palestine N.A. + + tb = tb[tb["country"] != "West Bank and Gaza"] + ##### There are also two values for Somalia, I will drop the least recent one + tb = tb[~((tb["country"] == "Somalia") & (tb["year"] == 2000))] + + # Adding the regional status to the polio free countries table + ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") + tb_region_status = ds_region_status["polio_status"].reset_index() + + # Adding regions data + # ds_regions = paths.load_dataset("regions") # Assign polio free countries. 
- tb = define_polio_free_new(tb, latest_year=LATEST_YEAR) + + tb = harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb, tb_status = define_polio_free_new(tb, latest_year=LATEST_YEAR) # Set an index and sort. tb = tb.format() + tb = tb.set_index(["country"]).sort_index() + tb_status = tb_status.format() # # Save outputs. # # Create a new garden dataset. - ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden = create_dataset(dest_dir, tables=[tb, tb_status], check_variables_metadata=True) ds_garden.save() @@ -52,6 +70,7 @@ def define_polio_free_new(tb: Table, latest_year: int) -> Table: tb["year"] = tb["year"].astype(int) # Rename year to latest year tb = tb.rename(columns={"year": "latest_year_wild_polio_case"}) + tb["year"] = latest_year # Create a product of all countries and all years from 1910 to LATEST_YEAR tb_prod = Table(product(tb["country"].unique(), range(1910, latest_year + 1)), columns=["country", "year"]) @@ -63,6 +82,4 @@ def define_polio_free_new(tb: Table, latest_year: int) -> Table: axis=1, ) - tb = tb.merge(tb_prod, on="country") - - return tb + return tb, tb_prod diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml index 4c74ee6c2e1..5e011a9e463 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml @@ -1,9 +1,9 @@ definitions: common: presentation: - attribution_short: GODT topic_tags: - Global Health + - Eradication of Diseases processing_level: minor unit: "" short_unit: "" @@ -14,162 +14,5 @@ dataset: update_period_days: 365 tables: - organ_donation_and_transplantation: + polio_free_countries: variables: - n_organ_donors: - title: Total number of actual deceased organ donors - description_short: |- - Actual donors are those individuals from whom one or more organs have been recovered for the 
purpose of transplantation. - n_donors_after_brain_death: - title: Number of actual donors after brain death - description_short: Deceased organ donors in whom death has been determined by neurological criteria. - n_donors_after_circulatory_death: - title: Number of actual donors after circulatory death - description_short: Deceased organ donors in whom death has been determined by circulatory criteria. - n_utilized_organ_donors: - title: Total number of utilized deceased organ donors - n_utilized_donors_after_brain_death: - title: Number of utilized donors after brain death - description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. - n_utilized_donors_after_circulatory_death: - title: Number of utilized donors after circulatory death - description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. - n_kidney_transplantation_from_deceased_persons: - title: Number of kidney transplantations from deceased persons - n_kidney_transplantation_from_living_persons: - title: Number of kidney transplantations from living persons - n_kidney_transplantation: - title: Total number of kidney transplantations - n_liver_transplantation_from_deceased_persons: - title: Number of liver transplantations from deceased persons - n_liver_transplantation_from_living_persons: - title: Number of liver transplantations from living persons - n_domino_liver_transplantation: - title: Number of domino liver transplantations - description_short: |- - A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. 
- n_liver_transplantation: - title: Total number of liver transplantations - n_heart_transplantation: - title: Total number of heart transplantations - n_lung_transplantation_from_deceased_persons: - title: Number of lung transplantations from deceased persons - n_lung_transplantation_from_living_persons: - title: Number of lung transplantations from living persons - n_lung_transplantation: - title: Total number of lung transplantations - n_pancreas_transplantation: - title: Total number of pancreas transplantations - n_kidney_pancreas_transplantation: - title: Total number of kidney-pancreas transplantations - n_small_bowel_transplantation: - title: Total number of small bowel transplantations - # Add indicators per million people. - n_organ_donors_per_million_people: - title: Total number of actual deceased organ donors per million people - description_short: |- - Actual donors are those individuals from whom one or more organs have been recovered for the purpose of transplantation. - display: - numDecimalPlaces: 1 - processing_level: major - n_donors_after_brain_death_per_million_people: - title: Number of actual donors after brain death per million people - description_short: Deceased organ donors in whom death has been determined by neurological criteria. - processing_level: major - display: - numDecimalPlaces: 1 - n_donors_after_circulatory_death_per_million_people: - title: Number of actual donors after circulatory death per million people - description_short: Deceased organ donors in whom death has been determined by circulatory criteria. 
- processing_level: major - display: - numDecimalPlaces: 1 - n_utilized_organ_donors_per_million_people: - title: Total number of utilized deceased organ donors per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_utilized_donors_after_brain_death_per_million_people: - title: Number of utilized donors after brain death per million people - description_short: Utilized deceased organ donors in whom death has been determined by neurological criteria. - processing_level: major - display: - numDecimalPlaces: 1 - n_utilized_donors_after_circulatory_death_per_million_people: - title: Number of utilized donors after circulatory death per million people - description_short: Utilized deceased organ donors in whom death has been determined by circulatory criteria. - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_transplantation_from_deceased_persons_per_million_people: - title: Number of kidney transplantations from deceased persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_transplantation_from_living_persons_per_million_people: - title: Number of kidney transplantations from living persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_transplantation_per_million_people: - title: Total number of kidney transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_liver_transplantation_from_deceased_persons_per_million_people: - title: Number of liver transplantations from deceased persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_liver_transplantation_from_living_persons_per_million_people: - title: Number of liver transplantations from living persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_domino_liver_transplantation_per_million_people: - title: Number of domino liver transplantations per million people - 
description_short: |- - A domino liver transplantation is a surgical procedure in which a liver from a patient with a metabolic disorder is transplanted into a recipient, while the liver of the recipient is transplanted into another recipient. - processing_level: major - display: - numDecimalPlaces: 1 - n_liver_transplantation_per_million_people: - title: Total number of liver transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_heart_transplantation_per_million_people: - title: Total number of heart transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_lung_transplantation_from_deceased_persons_per_million_people: - title: Number of lung transplantations from deceased persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_lung_transplantation_from_living_persons_per_million_people: - title: Number of lung transplantations from living persons per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_lung_transplantation_per_million_people: - title: Total number of lung transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_pancreas_transplantation_per_million_people: - title: Total number of pancreas transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_kidney_pancreas_transplantation_per_million_people: - title: Total number of kidney-pancreas transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 - n_small_bowel_transplantation_per_million_people: - title: Total number of small bowel transplantations per million people - processing_level: major - display: - numDecimalPlaces: 1 From e8f2e3e5a78b7a45bda46c78566daf291a0e0ec0 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 12:45:08 +0100 Subject: [PATCH 22/35] wip --- .../data/garden/health/2024-04-12/polio_free_countries.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index d81182a31e2..fc3937b813f 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -26,7 +26,7 @@ def run(dest_dir: str) -> None: tb = tb[tb["country"] != "West Bank and Gaza"] ##### There are also two values for Somalia, I will drop the least recent one - tb = tb[~((tb["country"] == "Somalia") & (tb["year"] == 2000))] + tb = tb[~((tb["country"] == "Somalia") & (tb["year"] == "2000"))] # Adding the regional status to the polio free countries table ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") @@ -41,7 +41,7 @@ def run(dest_dir: str) -> None: tb, tb_status = define_polio_free_new(tb, latest_year=LATEST_YEAR) # Set an index and sort. tb = tb.format() - tb = tb.set_index(["country"]).sort_index() + # tb = tb.set_index(["country"]).sort_index() tb_status = tb_status.format() # From f97c9579cde9562e4b1eec8b7f593e0febe7a1ae Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 12:46:28 +0100 Subject: [PATCH 23/35] updates --- dag/health.yml | 1 + etl/steps/data/garden/health/2024-04-12/polio_free_countries.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dag/health.yml b/dag/health.yml index b2d93aaaf55..3eb88c5fae3 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -543,5 +543,6 @@ steps: data://garden/health/2024-04-12/polio_free_countries: - data://meadow/health/2024-04-12/polio_status - data://meadow/health/2024-04-12/polio_free_countries + - data://garden/regions/2023-01-01/regions data://meadow/health/2024-04-12/polio_free_countries: - snapshot://health/2024-04-12/polio_free_countries.csv diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py 
b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index fc3937b813f..bd207351e26 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -43,7 +43,7 @@ def run(dest_dir: str) -> None: tb = tb.format() # tb = tb.set_index(["country"]).sort_index() tb_status = tb_status.format() - + tb_status.metadata.short_name = "polio_free_countries_status" # # Save outputs. # From 5e6d39f21f8dd491762ae69471e8373cbe83378e Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 13:48:32 +0100 Subject: [PATCH 24/35] adding who regional certification --- dag/health.yml | 1 + .../health/2024-04-12/polio_free_countries.py | 35 +++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/dag/health.yml b/dag/health.yml index 3eb88c5fae3..6fd4898811d 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -544,5 +544,6 @@ steps: - data://meadow/health/2024-04-12/polio_status - data://meadow/health/2024-04-12/polio_free_countries - data://garden/regions/2023-01-01/regions + - data://meadow/who/2024-04-09/polio_historical data://meadow/health/2024-04-12/polio_free_countries: - snapshot://health/2024-04-12/polio_free_countries.csv diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index bd207351e26..67896fd061b 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -2,9 +2,9 @@ from itertools import product -from owid.catalog import Table +from owid.catalog import Dataset, Table -from etl.data_helpers.geo import harmonize_countries +from etl.data_helpers.geo import harmonize_countries, list_members_of_region from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. 
@@ -32,6 +32,9 @@ def run(dest_dir: str) -> None: ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") tb_region_status = ds_region_status["polio_status"].reset_index() + ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() + who_regions = tb_regions[tb_regions["defined_by"] == "who"] # Adding regions data # ds_regions = paths.load_dataset("regions") # Assign polio free countries. @@ -39,6 +42,8 @@ def run(dest_dir: str) -> None: tb = harmonize_countries(df=tb, countries_file=paths.country_mapping_path) tb, tb_status = define_polio_free_new(tb, latest_year=LATEST_YEAR) + + tb_status = add_polio_region_certification(tb_status, tb_region_status, who_regions, ds_regions) # Set an index and sort. tb = tb.format() # tb = tb.set_index(["country"]).sort_index() @@ -52,6 +57,32 @@ def run(dest_dir: str) -> None: ds_garden.save() +def add_polio_region_certification( + tb_status: Table, tb_region_status: Table, who_regions: Table, ds_regions: Dataset +) -> Table: + # Append "(WHO)" suffix to the "who_region" to match the region names in the who_regions table + tb_region_status["who_region"] = tb_region_status["who_region"].astype(str) + " (WHO)" + + # Correct mapping of regions to status updates by ensuring 'region' matches the modified 'who_region' entries + for region in who_regions["name"]: + # Generate country list for the current region + country_list = list_members_of_region(region=region, ds_regions=ds_regions) + if not country_list: + raise ValueError(f"No countries found for region {region}") + + # Find the year of certification for the current region + year_certified = tb_region_status.loc[tb_region_status["who_region"] == region, "year_certified_polio_free"] + + # Check if there is a valid year of certification + if not year_certified.empty and year_certified.notna().all(): + # Set the status for all relevant countries and years + tb_status.loc[ + (tb_status["country"].isin(country_list)) & 
(tb_status["year"] >= int(year_certified)), "status" + ] = "WHO Region certified polio-free" + + return tb_status + + def define_polio_free_new(tb: Table, latest_year: int) -> Table: """Define the polio free countries table.""" # Make a copy of the DataFrame to avoid modifying the original DataFrame From 83001e9d423032143bf6bec180e9c953f9a32711 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 14:17:28 +0100 Subject: [PATCH 25/35] adding grapher step and ironing out some wrinkles --- dag/health.yml | 7 ++-- .../2024-04-12/polio_free_countries.meta.yml | 30 +++++++++++++++ .../health/2024-04-12/polio_free_countries.py | 38 +++++++++---------- .../2024-04-12/polio_free_countries.yml | 18 --------- .../health/2024-04-12/polio_free_countries.py | 22 +++++++++++ 5 files changed, 74 insertions(+), 41 deletions(-) create mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml delete mode 100644 etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml create mode 100644 etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py diff --git a/dag/health.yml b/dag/health.yml index 6fd4898811d..320697744bb 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -540,10 +540,11 @@ steps: - snapshot://health/2024-04-12/polio_status.csv # Polio free countries + data://meadow/health/2024-04-12/polio_free_countries: + - snapshot://health/2024-04-12/polio_free_countries.csv data://garden/health/2024-04-12/polio_free_countries: - data://meadow/health/2024-04-12/polio_status - data://meadow/health/2024-04-12/polio_free_countries - data://garden/regions/2023-01-01/regions - - data://meadow/who/2024-04-09/polio_historical - data://meadow/health/2024-04-12/polio_free_countries: - - snapshot://health/2024-04-12/polio_free_countries.csv + data://grapher/health/2024-04-12/polio_free_countries: + - data://garden/health/2024-04-12/polio_free_countries diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml 
b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml new file mode 100644 index 00000000000..18c6ab5715e --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml @@ -0,0 +1,30 @@ +definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + processing_level: minor + unit: "" + short_unit: "" + display: + numDecimalPlaces: 0 + +dataset: + update_period_days: 365 + +tables: + polio_free_countries: + variables: + latest_year_wild_polio_case: + title: Latest year of wild polio case + description_short: The most recent year in which a case of wild poliovirus was detected in a country. + unit: "" + display: + numDecimalPlaces: 0 + status: + title: Polio-free status + description_short: The status of a country in terms of polio eradication. + unit: "" + display: + numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index 67896fd061b..726ffe0e03e 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -3,6 +3,7 @@ from itertools import product from owid.catalog import Dataset, Table +from owid.catalog import processing as pr from etl.data_helpers.geo import harmonize_countries, list_members_of_region from etl.helpers import PathFinder, create_dataset @@ -28,37 +29,33 @@ def run(dest_dir: str) -> None: ##### There are also two values for Somalia, I will drop the least recent one tb = tb[~((tb["country"] == "Somalia") & (tb["year"] == "2000"))] - # Adding the regional status to the polio free countries table + # Loading the polio status data for WHO regions ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") tb_region_status = ds_region_status["polio_status"].reset_index() + # Loading in the regions table so we know which countries are in each WHO 
region ds_regions = paths.load_dataset("regions") tb_regions = ds_regions["regions"].reset_index() - who_regions = tb_regions[tb_regions["defined_by"] == "who"] - # Adding regions data - # ds_regions = paths.load_dataset("regions") - # Assign polio free countries. + who_regions = tb_regions[(tb_regions["defined_by"] == "who") & (tb_regions["region_type"] == "aggregate")] tb = harmonize_countries(df=tb, countries_file=paths.country_mapping_path) - tb, tb_status = define_polio_free_new(tb, latest_year=LATEST_YEAR) + # Assign polio free countries. + tb = define_polio_free_new(tb, latest_year=LATEST_YEAR) - tb_status = add_polio_region_certification(tb_status, tb_region_status, who_regions, ds_regions) + tb = add_polio_region_certification(tb, tb_region_status, who_regions, ds_regions) # Set an index and sort. tb = tb.format() - # tb = tb.set_index(["country"]).sort_index() - tb_status = tb_status.format() - tb_status.metadata.short_name = "polio_free_countries_status" # # Save outputs. # # Create a new garden dataset. 
- ds_garden = create_dataset(dest_dir, tables=[tb, tb_status], check_variables_metadata=True) + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) ds_garden.save() def add_polio_region_certification( - tb_status: Table, tb_region_status: Table, who_regions: Table, ds_regions: Dataset + tb: Table, tb_region_status: Table, who_regions: Table, ds_regions: Dataset ) -> Table: # Append "(WHO)" suffix to the "who_region" to match the region names in the who_regions table tb_region_status["who_region"] = tb_region_status["who_region"].astype(str) + " (WHO)" @@ -76,34 +73,33 @@ def add_polio_region_certification( # Check if there is a valid year of certification if not year_certified.empty and year_certified.notna().all(): # Set the status for all relevant countries and years - tb_status.loc[ - (tb_status["country"].isin(country_list)) & (tb_status["year"] >= int(year_certified)), "status" + tb.loc[ + (tb["country"].isin(country_list)) & (tb["year"] >= int(year_certified)), "status" ] = "WHO Region certified polio-free" - return tb_status + return tb def define_polio_free_new(tb: Table, latest_year: int) -> Table: """Define the polio free countries table.""" - # Make a copy of the DataFrame to avoid modifying the original DataFrame - tb = tb.copy() # Clean the data tb["year"] = tb["year"].astype(str) # Drop countries with missing values explicitly copying to avoid setting on a slice warning - tb = tb[tb["year"] != "data not available"].copy() + tb = tb[tb["year"] != "data not available"] # Change 'pre 1985' to 1984 and 'ongoing' to LATEST_YEAR + 1 tb.loc[tb["year"] == "pre 1985", "year"] = "1984" tb.loc[tb["year"] == "ongoing", "year"] = str(latest_year + 1) - tb["year"] = tb["year"].astype(int) + tb["year"] = tb["year"].astype(int).copy() # Rename year to latest year tb = tb.rename(columns={"year": "latest_year_wild_polio_case"}) tb["year"] = latest_year # Create a product of all countries and all years from 1910 to LATEST_YEAR tb_prod = 
Table(product(tb["country"].unique(), range(1910, latest_year + 1)), columns=["country", "year"]) + tb_prod = tb_prod.copy_metadata(from_table=tb) # Define polio status based on the year comparison tb_prod["status"] = tb_prod.apply( @@ -112,5 +108,7 @@ def define_polio_free_new(tb: Table, latest_year: int) -> Table: else "Polio-free (not certified)", axis=1, ) + # Merge the two tables + tb = pr.merge(tb, tb_prod, on=["country", "year"], how="right") - return tb, tb_prod + return tb diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml deleted file mode 100644 index 5e011a9e463..00000000000 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.yml +++ /dev/null @@ -1,18 +0,0 @@ -definitions: - common: - presentation: - topic_tags: - - Global Health - - Eradication of Diseases - processing_level: minor - unit: "" - short_unit: "" - display: - numDecimalPlaces: 0 - -dataset: - update_period_days: 365 - -tables: - polio_free_countries: - variables: diff --git a/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..5fc574f0ff5 --- /dev/null +++ b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("polio_free_countries") + tb = ds_garden["polio_free_countries"] + tb_status = ds_garden["polio_countries_status"] + # + # Save outputs. + # + # Create a new grapher dataset. 
+ ds_grapher = create_dataset(dest_dir, tables=[tb, tb_status], check_variables_metadata=True) + ds_grapher.save() From 898aab4733e8739b939bb6794c8dacaa43c8f0ce Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 14:23:18 +0100 Subject: [PATCH 26/35] fixing some warnings --- .../garden/health/2024-04-12/polio_free_countries.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index 726ffe0e03e..4d394f21f51 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -41,7 +41,7 @@ def run(dest_dir: str) -> None: tb = harmonize_countries(df=tb, countries_file=paths.country_mapping_path) # Assign polio free countries. - tb = define_polio_free_new(tb, latest_year=LATEST_YEAR) + tb = define_polio_free(tb, latest_year=LATEST_YEAR) tb = add_polio_region_certification(tb, tb_region_status, who_regions, ds_regions) # Set an index and sort. 
@@ -72,22 +72,23 @@ def add_polio_region_certification( # Check if there is a valid year of certification if not year_certified.empty and year_certified.notna().all(): + year_certified_int = int(year_certified.iloc[0]) # Set the status for all relevant countries and years tb.loc[ - (tb["country"].isin(country_list)) & (tb["year"] >= int(year_certified)), "status" + (tb["country"].isin(country_list)) & (tb["year"] >= year_certified_int), "status" ] = "WHO Region certified polio-free" return tb -def define_polio_free_new(tb: Table, latest_year: int) -> Table: +def define_polio_free(tb: Table, latest_year: int) -> Table: """Define the polio free countries table.""" - + tb = tb.copy() # Clean the data tb["year"] = tb["year"].astype(str) # Drop countries with missing values explicitly copying to avoid setting on a slice warning - tb = tb[tb["year"] != "data not available"] + tb = tb[tb["year"] != "data not available"].copy() # Change 'pre 1985' to 1984 and 'ongoing' to LATEST_YEAR + 1 tb.loc[tb["year"] == "pre 1985", "year"] = "1984" From c7fd009f103cc5ef7d55d42f69f22a62fce91bdd Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 14:28:45 +0100 Subject: [PATCH 27/35] getting it going on grapher --- .../data/garden/health/2024-04-12/polio_free_countries.py | 3 ++- .../data/grapher/health/2024-04-12/polio_free_countries.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index 4d394f21f51..fbab1cc3da6 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -111,5 +111,6 @@ def define_polio_free(tb: Table, latest_year: int) -> Table: ) # Merge the two tables tb = pr.merge(tb, tb_prod, on=["country", "year"], how="right") - + # Issues with status not having origins or source, not sure this is the best way to solve + 
tb["status"] = tb["status"].copy_metadata(tb["latest_year_wild_polio_case"]) return tb diff --git a/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py index 5fc574f0ff5..b4ae4f6b3b0 100644 --- a/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py @@ -13,10 +13,9 @@ def run(dest_dir: str) -> None: # Load garden dataset and read its main table. ds_garden = paths.load_dataset("polio_free_countries") tb = ds_garden["polio_free_countries"] - tb_status = ds_garden["polio_countries_status"] # # Save outputs. # # Create a new grapher dataset. - ds_grapher = create_dataset(dest_dir, tables=[tb, tb_status], check_variables_metadata=True) + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) ds_grapher.save() From 6e14d8c608771342dc7550a416107f6f3711ebd8 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 14:31:01 +0100 Subject: [PATCH 28/35] changing dataset titles --- etl/steps/data/garden/who/2024-04-08/polio.meta.yml | 1 - snapshots/health/2024-04-12/polio_free_countries.csv.dvc | 2 +- snapshots/who/2024-04-08/polio_afp.csv.dvc | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml index 56a40630476..9f49a716129 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -9,7 +9,6 @@ definitions: # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: - title: "World Health Organization (WHO) - Polio" update_period_days: 365 tables: diff --git a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc index 6b8ea00bbcf..f206d0ecbdc 100644 --- 
a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc +++ b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc @@ -3,7 +3,7 @@ meta: origin: # Data product / Snapshot - title: Polio-Free Countries + title: Polio-Free Countries - GPEI date_published: "2016" # Citation diff --git a/snapshots/who/2024-04-08/polio_afp.csv.dvc b/snapshots/who/2024-04-08/polio_afp.csv.dvc index fa55d155995..48fa7acff58 100644 --- a/snapshots/who/2024-04-08/polio_afp.csv.dvc +++ b/snapshots/who/2024-04-08/polio_afp.csv.dvc @@ -3,7 +3,7 @@ meta: origin: # Data product / Snapshot - title: World Health Organization - Acute Flaccid Paralysis + title: Polio cases - World Health Organization date_published: "2024-04-04" # Citation @@ -11,7 +11,6 @@ meta: citation_full: |- World Health Organization (2024) attribution_short: WHO - # Files url_main: https://extranet.who.int/polis/public/CaseCount.aspx date_accessed: 2024-04-08 From db4a14627a75a894f42840b2fb1367a80c2d2c38 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 16:53:33 +0100 Subject: [PATCH 29/35] add low risk --- etl/steps/data/garden/who/2024-04-08/polio.py | 88 ++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index e9a63788dff..a6afb68111f 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -124,6 +124,91 @@ def add_cases_per_million(tb: Table, tb_population: Table) -> Table: return tb +def identify_low_risk_countries(tb: Table) -> Table: + # Identify low-risk countries (where the surveillance status can be disregarded) + # High risk entities are those identified in the table on page 48 in this document: https://polioeradication.org/wp-content/uploads/2022/04/GPSAP-2022-2024-EN.pdf + higher_risk_entities = [ + "Chad", + "Democratic Republic of Congo", + "Ethiopia", + "Niger", + "Nigeria", + "Afghanistan", + "Pakistan", + 
"Somalia", + "Angola", + "Burkina Faso", + "Cameroon", + "Central African Republic", + "Guinea", + "Kenya", + "Mali", + "South Sudan", + "Yemen", + "Benin", + "Cote d'Ivoire", + "Equatorial Guinea", + "Guinea-Bissau", + "Madagascar", + "Mozambique", + "Togo", + "Iraq", + "Sudan", + "Syria", + "Myanmar", + "Papua New Guinea", + "Philippines", + "Burundi", + "Congo", + "Gabon", + "Gambia", + "Ghana", + "Liberia", + "Senegal", + "Sierra Leone", + "Uganda", + "Zambia", + "Djibouti", + "Egypt", + "Iran", + "Libya", + "Tajikistan", + "Ukraine", + "Indonesia", + "Nepal", + "Haiti", + "Laos", + "China", + "Eritrea", + "Malawi", + "Mauritania", + "Namibia", + "Rwanda", + "Tanzania", + "Zimbabwe", + "Lebanon", + "Bangladesh", + "India", + "East Timor", + "Bolivia", + "Cambodia", + "Malaysia", + ] + + difference = [item for item in higher_risk_entities if item not in tb["country"].unique()] + assert difference == [], f"Entities in the high-risk list that are not in the dataset: {difference}" + + # Define the condition for which countries are not in high-risk entities + not_high_risk = ~tb["country"].isin(higher_risk_entities) + + # Define the condition for screening year + is_screening_year = tb["year"] == SCREENING_YEAR + + # Combine conditions and update 'polio_surveillance_status' for matching rows + tb.loc[not_high_risk & is_screening_year, "polio_surveillance_status"] = "Low risk" + return tb + + def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: """ Adds the polio surveillance status based on the screening and testing rates. @@ -136,7 +221,6 @@ def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: Returns: - Modified table with a new column for polio surveillance status. 
""" - # tb["polio_surveillance_status"] = pd.NA tb.loc[ (tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] >= 80) @@ -155,6 +239,8 @@ def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR), "polio_surveillance_status", ] = "Inadequate screening and testing" + + tb = identify_low_risk_countries(tb) # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise tb["polio_surveillance_status"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins return tb From e98a8cb5172499b210dfed233a15ac30d60a50a0 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 17:25:33 +0100 Subject: [PATCH 30/35] sorting out the surveillance variable --- .../data/garden/who/2024-04-08/polio.meta.yml | 8 ++++++ etl/steps/data/garden/who/2024-04-08/polio.py | 28 ++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml index 9f49a716129..decd3399126 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -5,6 +5,12 @@ definitions: topic_tags: - Global Health - Eradication of Diseases + surveillance_sort: &surveillance_sort + - Inadequate screening and testing + - Inadequate screening + - Indaqueate testing + - Adequate screening and testing + - Low risk # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ @@ -79,6 +85,8 @@ tables: description_short: "The status of polio surveillance in a given country" description_key: ["A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under.", "A country is considered to have adequate testing if it has a 
percentage adequate stool collection of at least 80%.", "Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf)"] unit: "" + type: ordinal + sort: *surveillance_sort total_cases: title: Total polio cases description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index a6afb68111f..021d68cf780 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -1,10 +1,10 @@ """Load a meadow dataset and create a garden dataset.""" import pandas as pd -from owid.catalog import Table +from owid.catalog import Dataset, Table from owid.catalog import processing as pr -from etl.data_helpers.geo import add_regions_to_table, harmonize_countries +from etl.data_helpers.geo import add_regions_to_table, harmonize_countries, list_members_of_region from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -35,6 +35,7 @@ def run(dest_dir: str) -> None: tb_cvdpv = tb_cvdpv.drop(columns=["total_cvdpv"]) # Load regions dataset. ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() # Load income groups dataset. 
ds_income_groups = paths.load_dataset("income_groups") @@ -124,7 +125,19 @@ def add_cases_per_million(tb: Table, tb_population: Table) -> Table: return tb -def identify_low_risk_countries(tb: Table) -> Table: +def list_of_who_countries(tb_regions: Table, ds_regions: Dataset) -> list: + """List of countries as defined by WHO.""" + who_countries = [] + who_regions = tb_regions[tb_regions["defined_by"] == "who"] + for region in who_regions["name"]: + country_list = list_members_of_region(region=region, ds_regions=ds_regions) + if not country_list: + raise ValueError(f"No countries found for region {region}") + who_countries.extend(country_list) + return who_countries + + +def identify_low_risk_countries(tb: Table, tb_regions: Table, ds_regions: Dataset) -> Table: # Identify low-risk countries (where the surveillance status can be disregarded) # High risk entities are those identified in the table on page 48 in this document: https://polioeradication.org/wp-content/uploads/2022/04/GPSAP-2022-2024-EN.pdf higher_risk_entities = [ @@ -206,10 +219,11 @@ def identify_low_risk_countries(tb: Table) -> Table: # Combine conditions and update 'polio_surveillance_status' for matching rows tb.loc[not_high_risk & is_screening_year, "polio_surveillance_status"] = "Low risk" + return tb -def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: +def add_screening_and_testing(tb: Table, tb_regions: Dataset, ds_regions: Dataset) -> Table: """ Adds the polio surveillance status based on the screening and testing rates. For use in this chart: https://ourworldindata.org/grapher/polio-screening-and-testing @@ -221,6 +235,12 @@ def add_screening_and_testing(tb: Table, year=SCREENING_YEAR) -> Table: Returns: - Modified table with a new column for polio surveillance status. 
""" + # Ensuring we have all the countries in the WHO regions - even if there isn't other polio data for them + who_countries = list_of_who_countries(tb_regions, ds_regions) + who_tb = Table({"country": who_countries, "year": SCREENING_YEAR}).copy_metadata(from_table=tb) + tb = tb.merge(who_tb, on=["country", "year"], how="outer") + + # Add the polio surveillance status based on the screening and testing rates tb.loc[ (tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] >= 80) From 16fae493ec7508a494905fc983ed32782d014052 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 17:27:00 +0100 Subject: [PATCH 31/35] fixing missing arguments --- etl/steps/data/garden/who/2024-04-08/polio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index 021d68cf780..a0cf4705a52 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -84,7 +84,7 @@ def run(dest_dir: str) -> None: tb = add_correction_factor(tb) tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] # Add polio surveillance status based on the screening and testing rates. 
- tb = add_screening_and_testing(tb) + tb = add_screening_and_testing(tb, tb_regions, ds_regions) tb = add_cases_per_million(tb, tb_population) tb = tb.set_index(["country", "year"], verify_integrity=True) tb.metadata.short_name = "polio" @@ -260,7 +260,7 @@ def add_screening_and_testing(tb: Table, tb_regions: Dataset, ds_regions: Datase "polio_surveillance_status", ] = "Inadequate screening and testing" - tb = identify_low_risk_countries(tb) + tb = identify_low_risk_countries(tb, tb_regions, ds_regions) # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise tb["polio_surveillance_status"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins return tb From 8cb1453e3de90b0d0a2c86c80af0f9c6baf84423 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Wed, 17 Apr 2024 17:33:30 +0100 Subject: [PATCH 32/35] not using ordinal for now --- .../data/garden/who/2024-04-08/polio.meta.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml index decd3399126..bba061cd306 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -5,12 +5,13 @@ definitions: topic_tags: - Global Health - Eradication of Diseases - surveillance_sort: &surveillance_sort - - Inadequate screening and testing - - Inadequate screening - - Indaqueate testing - - Adequate screening and testing - - Low risk +# surveillance_sort: &surveillance_sort +# - Inadequate screening and testing +# - Inadequate screening +# - Inadequate testing +# - Adequate screening and testing +# - Low risk +# - nan # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ @@ -85,8 +86,8 @@ tables: description_short: "The status of polio surveillance in a given country" description_key: ["A country is considered to 
have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under.", "A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%.", "Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf)"] unit: "" - type: ordinal - sort: *surveillance_sort + #type: ordinal + #sort: *surveillance_sort total_cases: title: Total polio cases description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases From 54bd3f298e507013c1c1edbf297d6daded00523d Mon Sep 17 00:00:00 2001 From: spoonerf Date: Thu, 18 Apr 2024 10:36:05 +0100 Subject: [PATCH 33/35] adding pablo arriagada's suggestions --- .../health/2024-04-12/polio_free_countries.countries.json | 1 - .../data/garden/health/2024-04-12/polio_free_countries.py | 6 +++--- .../data/meadow/health/2024-04-12/polio_free_countries.py | 2 +- snapshots/health/2024-04-12/polio_free_countries.csv.dvc | 7 ++++--- snapshots/health/2024-04-12/polio_status.csv.dvc | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json index b808d4198a8..8af883c624a 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json @@ -202,7 +202,6 @@ "Turks and Caicos Islands": "Turks and Caicos Islands", "Venezuela": "Venezuela", "Wallis and Futuna": "Wallis and Futuna", - "West Bank and Gaza": "Palestine", "Barabados": "Barbados", "Cote d\u2019Ivoire": "Cote d'Ivoire", "Democratic People\u2019s Rep. 
of Korea": "North Korea", diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py index fbab1cc3da6..f731727875c 100644 --- a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -88,13 +88,13 @@ def define_polio_free(tb: Table, latest_year: int) -> Table: tb["year"] = tb["year"].astype(str) # Drop countries with missing values explicitly copying to avoid setting on a slice warning - tb = tb[tb["year"] != "data not available"].copy() + tb = tb[tb["year"] != "data not available"] # Change 'pre 1985' to 1984 and 'ongoing' to LATEST_YEAR + 1 tb.loc[tb["year"] == "pre 1985", "year"] = "1984" - tb.loc[tb["year"] == "ongoing", "year"] = str(latest_year + 1) + tb.loc[tb["year"] == "ongoing", "year"] = str(latest_year) - tb["year"] = tb["year"].astype(int).copy() + tb["year"] = tb["year"].astype(int) # Rename year to latest year tb = tb.rename(columns={"year": "latest_year_wild_polio_case"}) tb["year"] = latest_year diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py index 87eb6597925..9d808e0ffef 100644 --- a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py +++ b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py @@ -13,7 +13,7 @@ def run(dest_dir: str) -> None: # Retrieve snapshot and read its data. snap = paths.load_snapshot("polio_free_countries.csv") tb = snap.read() - tb = tb[["country", "year"]].set_index(["country", "year"], verify_integrity=True) + tb = tb.format() # # Save outputs. 
# diff --git a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc index f206d0ecbdc..489761f09f1 100644 --- a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc +++ b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc @@ -4,12 +4,13 @@ meta: origin: # Data product / Snapshot title: Polio-Free Countries - GPEI - date_published: "2016" - + description: |- + This table shows the last recorded case of indigenous wild poliovirus (WPV) by country. As data methodology and quality varied widely across regions and countries in earlier years, this table is based on the best-available sources for the years before 2000. + date_published: "2023" # Citation producer: Global Polio Eradication Initiative citation_full: |- - Global Polio Eradication Initiative (2016) + Global Polio Eradication Initiative (2023) # Files url_main: https://polioeradication.org/where-we-work/polio-free-countries/ diff --git a/snapshots/health/2024-04-12/polio_status.csv.dvc b/snapshots/health/2024-04-12/polio_status.csv.dvc index 141c9fa8e4f..6a1897c27ef 100644 --- a/snapshots/health/2024-04-12/polio_status.csv.dvc +++ b/snapshots/health/2024-04-12/polio_status.csv.dvc @@ -13,7 +13,7 @@ meta: attribution_short: GPEI # Files - url_main: https://www.transplant-observatory.org/ + url_main: https://polioeradication.org/ date_accessed: 2024-04-12 # License From fc305052c668ea65641a29561c35382786bcfeb5 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Thu, 18 Apr 2024 12:14:38 +0100 Subject: [PATCH 34/35] adding pablo rosado's suggestions --- .../data/garden/who/2024-04-08/polio.meta.yml | 41 +++++++++++-------- etl/steps/data/garden/who/2024-04-08/polio.py | 7 ++-- .../garden/who/2024-04-09/polio_historical.py | 2 +- .../data/meadow/who/2024-04-08/polio_afp.py | 2 +- .../meadow/who/2024-04-09/polio_historical.py | 6 +-- .../2024-04-12/polio_free_countries.csv.dvc | 4 +- snapshots/health/2024-04-12/polio_status.py | 5 +-- 
snapshots/who/2024-04-08/polio_afp.csv.dvc | 8 ++-- .../who/2024-04-09/polio_historical.xls.dvc | 11 ++--- 9 files changed, 44 insertions(+), 42 deletions(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml index bba061cd306..9bab6dc40e2 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -36,16 +36,18 @@ tables: non_polio_afp_rate: title: Non-Polio acute flaccid paralysis (afp) rate description_short: The number of cases of acute flaccid paralysis (AFP) per 100,000 children aged 15 or under, not attributed to polio, that were detected and reported to the WHO. - description_key: ["Acute flaccid paralysis (AFP) surveillance is the gold standard for detecting cases of poliomyelitis.", "At least one case of non-polio AFP should be detected annually per 100,000 population aged less than 15 years. In endemic regions, to ensure even higher sensitivity, this rate should be two per 100 000."] + description_key: + - Acute flaccid paralysis (AFP) surveillance is the gold standard for detecting cases of poliomyelitis. + - At least one case of non-polio AFP should be detected annually per 100,000 population aged less than 15 years. In endemic regions, to ensure even higher sensitivity, this rate should be two per 100 000. unit: cases per 100,000 children pct_adequate_stool_collection: title: Adequate stool collection (%) - description_short: "The share of acute flaccid paralysis (AFP) cases, where stool samples were tested for poliovirus and reported to the WHO" + description_short: The share of acute flaccid paralysis (AFP) cases, where stool samples were tested for poliovirus and reported to the WHO. 
unit: "%" short_unit: "%" pending: title: Pending - description_short: "The number of acute flaccid paralysis (AFP) cases pending classification by the WHO" + description_short: The number of acute flaccid paralysis (AFP) cases pending classification by the WHO. unit: cases wild_poliovirus_cases: title: Wild Poliovirus cases @@ -55,19 +57,19 @@ tables: numDecimalPlaces: 0 wild_poliovirus_cases_per_million: title: Wild Poliovirus cases per million - description_short: "The number of cases of wild poliovirus detected in a given year per million people" + description_short: The number of cases of wild poliovirus detected in a given year per million people. unit: cases per million display: numDecimalPlaces: 1 cvdpv_cases: title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases - description_short: "The number of cases of circulating vaccine-derived poliovirus detected in a given year" + description_short: The number of cases of circulating vaccine-derived poliovirus detected in a given year. unit: cases display: numDecimalPlaces: 0 cvdpv_cases_per_million: title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases per million - description_short: "The number of cases of circulating vaccine-derived poliovirus detected in a given year per million people" + description_short: The number of cases of circulating vaccine-derived poliovirus detected in a given year per million people. unit: cases per million display: numDecimalPlaces: 1 @@ -79,74 +81,77 @@ tables: unit: "" correction_factor: title: Correction factor - description_short: "Correction factor applied to the number of reported polio cases to account for underreporting" + description_short: Correction factor applied to the number of reported polio cases to account for underreporting. 
unit: factor polio_surveillance_status: title: Polio surveillance status description_short: "The status of polio surveillance in a given country" - description_key: ["A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under.", "A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%.", "Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf)"] + description_key: + - A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under. + - A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%. + - Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan.](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf) unit: "" #type: ordinal #sort: *surveillance_sort total_cases: title: Total polio cases - description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases. unit: cases display: numDecimalPlaces: 0 total_cases_per_million: title: Total polio cases per million - description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases per million people + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases per million people. unit: cases per million display: numDecimalPlaces: 1 estimated_cases: title: Estimated polio cases description_short: The total estimated number of polio cases. 
- description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al (2010). + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al. (2010). unit: cases display: numDecimalPlaces: 0 estimated_cases_per_million: title: Estimated polio cases per million description_short: The total estimated number of polio cases per million people. - description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al (2010). + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al. (2010). unit: cases per million display: numDecimalPlaces: 1 cvdpv1: title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases - description_short: "The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year" + description_short: The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year. unit: cases display: numDecimalPlaces: 0 cvdpv1_per_million: title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV2) cases per million - description_short: "The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year, per million people" + description_short: The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year, per million people. unit: cases per million display: numDecimalPlaces: 1 cvdpv2: title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases - description_short: "The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year" + description_short: The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year. 
unit: cases display: numDecimalPlaces: 0 cvdpv2_per_million: title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases per million - description_short: "The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year, per million people" + description_short: The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year, per million people. unit: cases per million display: numDecimalPlaces: 1 cvdpv3: title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases - description_short: "The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year" + description_short: The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year. unit: cases display: numDecimalPlaces: 0 cvdpv3_per_million: title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases per million - description_short: "The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year, per million people" + description_short: The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year, per million people. unit: cases per million display: numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index a0cf4705a52..3e447934d2f 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -86,8 +86,7 @@ def run(dest_dir: str) -> None: # Add polio surveillance status based on the screening and testing rates. tb = add_screening_and_testing(tb, tb_regions, ds_regions) tb = add_cases_per_million(tb, tb_population) - tb = tb.set_index(["country", "year"], verify_integrity=True) - tb.metadata.short_name = "polio" + tb.format(short_name="polio") # # Save outputs. 
@@ -262,7 +261,7 @@ def add_screening_and_testing(tb: Table, tb_regions: Dataset, ds_regions: Datase tb = identify_low_risk_countries(tb, tb_regions, ds_regions) # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise - tb["polio_surveillance_status"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins + tb["polio_surveillance_status"] = tb["polio_surveillance_status"].copy_metadata(tb["non_polio_afp_rate"]) return tb @@ -322,5 +321,5 @@ def clean_adequate_stool_collection(tb: Table) -> Table: def remove_pre_2001_data(tb: Table) -> Table: """Remove data from before 2001.""" - tb = tb[tb["year"] >= 2001] + tb = tb[tb["year"] >= 2001].reset_index(drop=True) return tb diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.py b/etl/steps/data/garden/who/2024-04-09/polio_historical.py index 9c549d9fe0b..24975e73222 100644 --- a/etl/steps/data/garden/who/2024-04-09/polio_historical.py +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.py @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None: # tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) - tb = tb.set_index(["country", "year"], verify_integrity=True) + tb = tb.format() # # Save outputs. diff --git a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py index 8f42fb23fa3..e7ba26edcf6 100644 --- a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py +++ b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py @@ -20,7 +20,7 @@ def run(dest_dir: str) -> None: # Process data. # # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. - tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() + tb = tb.format() # # Save outputs. 
diff --git a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py index bfb712d668c..684655a90b7 100644 --- a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py +++ b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py @@ -22,11 +22,11 @@ def run(dest_dir: str) -> None: # Process data. # tb = pr.melt(tb, id_vars=["WHO_REGION", "ISO_code", "Cname", "Disease"], var_name="year", value_name="cases") - tb = tb.drop(columns=["WHO_REGION", "ISO_code", "Disease"]) - tb = tb.rename(columns={"Cname": "country"}) + tb = tb.drop(columns=["WHO_REGION", "ISO_code", "Disease"], errors="raise") + tb = tb.rename(columns={"Cname": "country"}, errors="raise") # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. - tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() + tb = tb.format() # # Save outputs. diff --git a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc index 489761f09f1..e24468c687e 100644 --- a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc +++ b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc @@ -3,9 +3,9 @@ meta: origin: # Data product / Snapshot - title: Polio-Free Countries - GPEI + title: Polio-Free Countries description: |- - This table shows the last recorded case of indigenous wild poliovirus (WPV) by country. As data methodology and quality varied widely across regions and countries in earlier years, this table is based on the best-available sources for the years before 2000. + This dataset shows the last recorded case of indigenous wild poliovirus (WPV) by country. As data methodology and quality varied widely across regions and countries in earlier years, this table is based on the best-available sources for the years before 2000. 
date_published: "2023" # Citation producer: Global Polio Eradication Initiative diff --git a/snapshots/health/2024-04-12/polio_status.py b/snapshots/health/2024-04-12/polio_status.py index 9608a7c8ec6..f0c03782c79 100644 --- a/snapshots/health/2024-04-12/polio_status.py +++ b/snapshots/health/2024-04-12/polio_status.py @@ -49,10 +49,7 @@ def main(upload: bool) -> None: ], } ) - df_to_file(df, file_path=snap.path) - - # Add file to DVC and upload to S3. - snap.dvc_add(upload=upload) + snap.create_snapshot(data=df, upload=upload) if __name__ == "__main__": diff --git a/snapshots/who/2024-04-08/polio_afp.csv.dvc b/snapshots/who/2024-04-08/polio_afp.csv.dvc index 48fa7acff58..2ee7ee44d99 100644 --- a/snapshots/who/2024-04-08/polio_afp.csv.dvc +++ b/snapshots/who/2024-04-08/polio_afp.csv.dvc @@ -3,13 +3,13 @@ meta: origin: # Data product / Snapshot - title: Polio cases - World Health Organization + title: Polio Cases and Acute Flaccid Paralysis (AFP) Surveillance date_published: "2024-04-04" # Citation producer: World Health Organization citation_full: |- - World Health Organization (2024) + World Health Organization - Polio cases (2024). 
attribution_short: WHO # Files url_main: https://extranet.who.int/polis/public/CaseCount.aspx @@ -17,8 +17,8 @@ meta: # License license: - name: CC BY 4.0 - url: https://www.who.int/about/policies/terms-of-use + name: CC BY-NC-SA 3.0 IGO + url: https://www.who.int/about/policies/publishing/copyright outs: - md5: 00d57ac66f4507ae66a35fecec365971 diff --git a/snapshots/who/2024-04-09/polio_historical.xls.dvc b/snapshots/who/2024-04-09/polio_historical.xls.dvc index 33169b13d00..10ab583cea4 100644 --- a/snapshots/who/2024-04-09/polio_historical.xls.dvc +++ b/snapshots/who/2024-04-09/polio_historical.xls.dvc @@ -5,21 +5,22 @@ meta: # Data product / Snapshot title: WHO Historical Polio Dataset date_published: "2019-12-10" + description: |- + The World Health Organization (WHO) provides a historical dataset on polio cases, but it is no longer available directly from the WHO website. Instead the web archived version can be downloaded [here](https://web.archive.org/web/20200101000000*/http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1) and it can be accessed by opening in 'Read Only' mode. # Citation producer: World Health Organization citation_full: |- - World Health Organization (2019) + World Health Organization - Historical Polio Dataset (2019). 
attribution_short: WHO - # Files - url_main: https://web.archive.org/web/20200101000000*/http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1 + url_main: https://www.who.int/news-room/fact-sheets/detail/poliomyelitis date_accessed: 2024-04-09 # License license: - name: CC BY 4.0 - url: https://www.who.int/about/policies/terms-of-use + name: CC BY-NC-SA 3.0 IGO + url: https://www.who.int/about/policies/publishing/copyright outs: - md5: 189201470a046c95b5f38c05a77fd6c2 From d25c23d33817a08e70bd948050add7c29dbe6f0e Mon Sep 17 00:00:00 2001 From: spoonerf Date: Thu, 18 Apr 2024 12:28:14 +0100 Subject: [PATCH 35/35] fixing error --- etl/steps/data/garden/who/2024-04-08/polio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py index 3e447934d2f..fb1b21979e8 100644 --- a/etl/steps/data/garden/who/2024-04-08/polio.py +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -86,7 +86,7 @@ def run(dest_dir: str) -> None: # Add polio surveillance status based on the screening and testing rates. tb = add_screening_and_testing(tb, tb_regions, ds_regions) tb = add_cases_per_million(tb, tb_population) - tb.format(short_name="polio") + tb = tb.format(short_name="polio") # # Save outputs.