diff --git a/dag/health.yml b/dag/health.yml index 7ade31c5ed1..320697744bb 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -512,3 +512,39 @@ steps: - data://garden/demography/2023-03-31/population data://grapher/health/2024-04-02/organ_donation_and_transplantation: - data://garden/health/2024-04-02/organ_donation_and_transplantation + + # Polio AFP surveillance + data://meadow/who/2024-04-08/polio_afp: + - snapshot://who/2024-04-08/polio_afp.csv + + # Polio historical data + data://meadow/who/2024-04-09/polio_historical: + - snapshot://who/2024-04-09/polio_historical.xls + data://garden/who/2024-04-09/polio_historical: + - data://meadow/who/2024-04-09/polio_historical + + # Combining polio datasets + data://garden/who/2024-04-08/polio: + - data://meadow/who/2024-04-08/polio_afp + - data://meadow/who/2024-04-09/polio_historical + - data://garden/wb/2023-04-30/income_groups + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - snapshot://fasttrack/latest/gpei.csv + - snapshot://health/2024-04-12/polio_status.csv + data://grapher/who/2024-04-08/polio: + - data://garden/who/2024-04-08/polio + + # Polio certification status + data://meadow/health/2024-04-12/polio_status: + - snapshot://health/2024-04-12/polio_status.csv + + # Polio free countries + data://meadow/health/2024-04-12/polio_free_countries: + - snapshot://health/2024-04-12/polio_free_countries.csv + data://garden/health/2024-04-12/polio_free_countries: + - data://meadow/health/2024-04-12/polio_status + - data://meadow/health/2024-04-12/polio_free_countries + - data://garden/regions/2023-01-01/regions + data://grapher/health/2024-04-12/polio_free_countries: + - data://garden/health/2024-04-12/polio_free_countries diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json new file mode 100644 index 00000000000..8af883c624a --- /dev/null +++ 
b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.countries.json @@ -0,0 +1,217 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guam": "Guam", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + 
"Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + 
"Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Libyan Arab Jamahiriya": "Libya", + "American Samoa": "American Samoa", + "Anguilla": "Anguilla", + "Bermuda": "Bermuda", + "Bolivia": "Bolivia", + "British Virgin Islands": "British Virgin Islands", + "Cayman Islands": "Cayman Islands", + "East Timor": "East Timor", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "Guadeloupe": "Guadeloupe", + "Iran": "Iran", + "Martinique": "Martinique", + "Montserrat": "Montserrat", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "Puerto Rico": "Puerto Rico", + "Reunion": "Reunion", + "Saint Helena": "Saint Helena", + "Swaziland": "Eswatini", + "Tokelau": "Tokelau", + "Turkey": "Turkey", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Venezuela": "Venezuela", + "Wallis and Futuna": "Wallis and Futuna", + "Barabados": "Barbados", + "Cote d\u2019Ivoire": "Cote d'Ivoire", + "Democratic People\u2019s Rep. of Korea": "North Korea", + "Federated States of Micronesia": "Micronesia (country)", + "Former Yugoslav Rep. 
of Macedonia": "North Macedonia", + "Hong Kong, SAR": "Hong Kong", + "Lao People\u2019s Democratic Republic": "Laos", + "Macao, SAR": "Macao", + "Mariana Islands": "Northern Mariana Islands", + "Palestine N.A.": "Palestine", + "US Virgin Islands": "United States Virgin Islands", + "Democratic Republic of the Congo": "Democratic Republic of Congo" +} \ No newline at end of file diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml new file mode 100644 index 00000000000..18c6ab5715e --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.meta.yml @@ -0,0 +1,30 @@ +definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + processing_level: minor + unit: "" + short_unit: "" + display: + numDecimalPlaces: 0 + +dataset: + update_period_days: 365 + +tables: + polio_free_countries: + variables: + latest_year_wild_polio_case: + title: Latest year of wild polio case + description_short: The most recent year in which a case of wild poliovirus was detected in a country. + unit: "" + display: + numDecimalPlaces: 0 + status: + title: Polio-free status + description_short: The status of a country in terms of polio eradication. 
+ unit: "" + display: + numDecimalPlaces: 0 diff --git a/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..f731727875c --- /dev/null +++ b/etl/steps/data/garden/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,116 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from itertools import product + +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr + +from etl.data_helpers.geo import harmonize_countries, list_members_of_region +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +LATEST_YEAR = 2023 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("polio_free_countries") + tb = ds_meadow["polio_free_countries"].reset_index() + + ##### Temporary fix - we remove West Bank and Gaza as there is data for both West Bank and Gaza _and_ Palestine N.A. (national authority). + ##### I'm not sure how we should treat these but for now I will just stick with the entity that has the latest value, so Palestine N.A. 
+ + tb = tb[tb["country"] != "West Bank and Gaza"] + ##### There are also two values for Somalia, I will drop the least recent one (the "2000" row). NOTE(review): the comparison is against the string "2000", so it presumably relies on year still being a raw string at this point - confirm + tb = tb[~((tb["country"] == "Somalia") & (tb["year"] == "2000"))] + + # Loading the polio status data for WHO regions + ds_region_status = paths.load_dataset(short_name="polio_status", channel="meadow") + tb_region_status = ds_region_status["polio_status"].reset_index() + + # Loading in the regions table so we know which countries are in each WHO region (only WHO-defined aggregate regions are kept) + ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() + who_regions = tb_regions[(tb_regions["defined_by"] == "who") & (tb_regions["region_type"] == "aggregate")] + + tb = harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + # Classify every country-year as "Endemic" or "Polio-free (not certified)". + tb = define_polio_free(tb, latest_year=LATEST_YEAR) + + tb = add_polio_region_certification(tb, tb_region_status, who_regions, ds_regions) + # Set an index and sort. + tb = tb.format() + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() + + +def add_polio_region_certification( + tb: Table, tb_region_status: Table, who_regions: Table, ds_regions: Dataset +) -> Table: + # Append "(WHO)" suffix to the "who_region" to match the region names in the who_regions table (note: this modifies tb_region_status in place) + tb_region_status["who_region"] = tb_region_status["who_region"].astype(str) + " (WHO)" + + # For each WHO region, look up its member countries and the year the region was certified polio-free + for region in who_regions["name"]: + # Generate country list for the current region + country_list = list_members_of_region(region=region, ds_regions=ds_regions) + if not country_list: + raise ValueError(f"No countries found for region {region}") + + # Find the year of certification for the current region + year_certified = tb_region_status.loc[tb_region_status["who_region"] == region, "year_certified_polio_free"] + + # Only apply the certification if the region has a match and none of the matched years is missing + if not year_certified.empty and year_certified.notna().all(): + year_certified_int = int(year_certified.iloc[0]) + # Overwrite the status of the region's member countries from the certification year onwards + tb.loc[ + (tb["country"].isin(country_list)) & (tb["year"] >= year_certified_int), "status" + ] = "WHO Region certified polio-free" + + return tb + + +def define_polio_free(tb: Table, latest_year: int) -> Table: + """Build a country-year table (1910 to latest_year) with each country's polio status and its latest year of a wild polio case.""" + tb = tb.copy() + # Cast year to string so the textual values below can be matched + tb["year"] = tb["year"].astype(str) + + # Drop countries whose year is reported as 'data not available' (tb was copied above, so the caller's table is not touched) + tb = tb[tb["year"] != "data not available"] + + # Change 'pre 1985' to 1984 and 'ongoing' to latest_year. NOTE(review): this comment previously said LATEST_YEAR + 1, but the code assigns latest_year; combined with the strict "<" used for the status, countries with ongoing transmission are labelled "Polio-free (not certified)" in latest_year itself - confirm which value is intended + tb.loc[tb["year"] == "pre 1985", "year"] = "1984" + tb.loc[tb["year"] == "ongoing", "year"] = str(latest_year) + + tb["year"] = tb["year"].astype(int) + # The source year is the latest year with a wild polio case: rename it and attach it to the latest_year row + tb = tb.rename(columns={"year": "latest_year_wild_polio_case"}) + tb["year"] = latest_year + 
# Create a product of all countries and all years from 1910 to latest_year (inclusive) + tb_prod = Table(product(tb["country"].unique(), range(1910, latest_year + 1)), columns=["country", "year"]) + tb_prod = tb_prod.copy_metadata(from_table=tb) + + # A country-year is "Endemic" strictly before the country's latest wild polio case year, otherwise "Polio-free (not certified)" + tb_prod["status"] = tb_prod.apply( + lambda row: "Endemic" + if row["year"] < tb[tb["country"] == row["country"]]["latest_year_wild_polio_case"].min() + else "Polio-free (not certified)", + axis=1, + ) + # Right-merge so every country-year of the product is kept; latest_year_wild_polio_case is only filled on the latest_year rows + tb = pr.merge(tb, tb_prod, on=["country", "year"], how="right") + # status has no origins or sources of its own, so copy the metadata from latest_year_wild_polio_case (not sure this is the best way to solve it) + tb["status"] = tb["status"].copy_metadata(tb["latest_year_wild_polio_case"]) + return tb diff --git a/etl/steps/data/garden/who/2024-04-08/polio.countries.json b/etl/steps/data/garden/who/2024-04-08/polio.countries.json new file mode 100644 index 00000000000..1a59d51c260 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.countries.json @@ -0,0 +1,213 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bahamas": "Bahamas", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape 
Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "China, Hong Kong SAR": "Hong Kong", + "China, Macao SAR": "Macao", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": 
"Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Moldova": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Reunion": "Reunion", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Serbia and Montenegro": "Serbia and Montenegro", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + 
"Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "West Bank and Gaza": "Palestine", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Libyan Arab Jamahiriya": "Libya", + "The former Yugoslav Republic of Macedonia": "North Macedonia", + "Timor Leste": "East Timor" +} \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json b/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json new file mode 100644 index 00000000000..94f87976655 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.excluded_countries.json @@ -0,0 +1,7 @@ +[ + "CAREC", + "Carec", + "Pacific Island Countries", + "Pacific Island countries", + "Yugoslavia" +] \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-08/polio.meta.yml b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml new file mode 100644 index 00000000000..9bab6dc40e2 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.meta.yml @@ -0,0 +1,157 @@ +# NOTE: To learn more about the fields, hover over their names. 
+definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases +# surveillance_sort: &surveillance_sort +# - Inadequate screening and testing +# - Inadequate screening +# - Inadequate testing +# - Adequate screening and testing +# - Low risk +# - nan + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + polio: + variables: + afp_cases: + title: Acute Flaccid Paralysis (AFP) cases + description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given year. + unit: cases + display: + numDecimalPlaces: 0 + afp_cases_per_million: + title: Acute Flaccid Paralysis (AFP) cases per million + description_short: Number of new cases of acute flaccid paralysis (AFP) reported in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + non_polio_afp_rate: + title: Non-Polio acute flaccid paralysis (afp) rate + description_short: The number of cases of acute flaccid paralysis (AFP) per 100,000 children aged 15 or under, not attributed to polio, that were detected and reported to the WHO. + description_key: + - Acute flaccid paralysis (AFP) surveillance is the gold standard for detecting cases of poliomyelitis. + - At least one case of non-polio AFP should be detected annually per 100,000 population aged less than 15 years. In endemic regions, to ensure even higher sensitivity, this rate should be two per 100 000. + unit: cases per 100,000 children + pct_adequate_stool_collection: + title: Adequate stool collection (%) + description_short: The share of acute flaccid paralysis (AFP) cases, where stool samples were tested for poliovirus and reported to the WHO. + unit: "%" + short_unit: "%" + pending: + title: Pending + description_short: The number of acute flaccid paralysis (AFP) cases pending classification by the WHO. 
+ unit: cases + wild_poliovirus_cases: + title: Wild Poliovirus cases + description_short: "The number of cases of wild poliovirus detected in a given year" + unit: cases + display: + numDecimalPlaces: 0 + wild_poliovirus_cases_per_million: + title: Wild Poliovirus cases per million + description_short: The number of cases of wild poliovirus detected in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv_cases: + title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases + description_short: The number of cases of circulating vaccine-derived poliovirus detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv_cases_per_million: + title: Circulating Vaccine-Derived Poliovirus (cVDPV) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus detected in a given year per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + compatibles: + title: Compatibles + unit: "" + footnote: + title: Footnote + unit: "" + correction_factor: + title: Correction factor + description_short: Correction factor applied to the number of reported polio cases to account for underreporting. + unit: factor + polio_surveillance_status: + title: Polio surveillance status + description_short: "The status of polio surveillance in a given country" + description_key: + - A country is considered to have adequate screening if it has a non-polio AFP rate of at least 2 per 100,000 children aged 15 or under. + - A country is considered to have adequate testing if it has a percentage adequate stool collection of at least 80%. 
+ - Countries are labelled 'low risk' if they were considered low risk by the risk assessment carried out for the [2022 GPEI surveillance action plan](https://polioeradication.org/wp-content/uploads/2022/05/GPSAP-2022-2024-EN.pdf). + unit: "" + #type: ordinal + #sort: *surveillance_sort + total_cases: + title: Total polio cases + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases. + unit: cases + display: + numDecimalPlaces: 0 + total_cases_per_million: + title: Total polio cases per million + description_short: The sum of wild poliovirus cases and circulating vaccine derived poliovirus cases per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + estimated_cases: + title: Estimated polio cases + description_short: The total estimated number of polio cases. + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al. (2010). + unit: cases + display: + numDecimalPlaces: 0 + estimated_cases_per_million: + title: Estimated polio cases per million + description_short: The total estimated number of polio cases per million people. + description_processing: Total estimated cases are calculated from reported cases using correction factors from Tebbens et al. (2010). + unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv1: + title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases + description_short: The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv1_per_million: + title: Circulating Vaccine-Derived Poliovirus type 1 (cVDPV1) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus type 1 detected in a given year, per million people. 
+ unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv2: + title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases + description_short: The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv2_per_million: + title: Circulating Vaccine-Derived Poliovirus type 2 (cVDPV2) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus type 2 detected in a given year, per million people. + unit: cases per million + display: + numDecimalPlaces: 1 + cvdpv3: + title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases + description_short: The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year. + unit: cases + display: + numDecimalPlaces: 0 + cvdpv3_per_million: + title: Circulating Vaccine-Derived Poliovirus type 3 (cVDPV3) cases per million + description_short: The number of cases of circulating vaccine-derived poliovirus type 3 detected in a given year, per million people. + unit: cases per million + display: + numDecimalPlaces: 1 diff --git a/etl/steps/data/garden/who/2024-04-08/polio.py b/etl/steps/data/garden/who/2024-04-08/polio.py new file mode 100644 index 00000000000..fb1b21979e8 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-08/polio.py @@ -0,0 +1,325 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr + +from etl.data_helpers.geo import add_regions_to_table, harmonize_countries, list_members_of_region +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Year to use for the screening and testing rates. +# Should be the most recent year of complete data. 
+SCREENING_YEAR = 2023 + +REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania", "World"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow acute flaccid paralysis dataset. + ds_meadow = paths.load_dataset("polio_afp") + # Load historical polio dataset + ds_historical = paths.load_dataset("polio_historical") + # Load population data to calculate cases per million population + ds_population = paths.load_dataset("population") + tb_population = ds_population["population"].reset_index() + # Load fasttrack Global Polio Eradication Initiative on circulating vaccine derived polio cases + snap_cvdpv = paths.load_snapshot("gpei.csv") + tb_cvdpv = snap_cvdpv.read() + # Dropping this as the total_cvdpv is also in the polio_afp table and has more historical data + tb_cvdpv = tb_cvdpv.drop(columns=["total_cvdpv"]) + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + tb_regions = ds_regions["regions"].reset_index() + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # Read table from meadow dataset. + tb = ds_meadow["polio_afp"].reset_index() + tb_hist = ds_historical["polio_historical"].reset_index() + tb_hist = tb_hist.rename(columns={"cases": "total_cases"}) + # Only need this for data prior to 2001 + tb_hist = tb_hist[tb_hist["year"] < 2001] + + # Remove data from before 2001. + tb = remove_pre_2001_data(tb) + # Remove values > 100% for "Adequate stool collection". + tb = clean_adequate_stool_collection(tb) + # Add total cases + tb["total_cases"] = tb["wild_poliovirus_cases"] + tb["cvdpv_cases"] + # Need to deal with overlapping years + tb = pr.concat([tb_hist, tb], axis=0) + tb = harmonize_countries( + df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path + ) + tb = tb.merge(tb_cvdpv, on=["country", "year"], how="left") + # Add region aggregates. 
+ tb_reg = add_regions_to_table( + tb[ + [ + "country", + "year", + "afp_cases", + "wild_poliovirus_cases", + "cvdpv_cases", + "total_cases", + "cvdpv1", + "cvdpv2", + "cvdpv3", + ] + ], + regions=REGIONS, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + min_num_values_per_year=1, + ) + tb_reg = tb_reg[tb_reg["country"].isin(REGIONS)] + tb = pr.concat([tb, tb_reg], axis=0) + # Add correction factor to estimate polio cases based on reported cases. + tb = add_correction_factor(tb) + tb["estimated_cases"] = tb["total_cases"] * tb["correction_factor"] + # Add polio surveillance status based on the screening and testing rates. + tb = add_screening_and_testing(tb, tb_regions, ds_regions) + tb = add_cases_per_million(tb, tb_population) + tb = tb.format(short_name="polio") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def add_cases_per_million(tb: Table, tb_population: Table) -> Table: + """ + Add cases per million population for each country, for the columns concerning each type of polio cases. 
+    """
+    tb_population = tb_population[["country", "year", "population"]]
+    # Left merge: rows without a matching population entry are kept and get missing per-million values.
+    tb = tb.merge(tb_population, on=["country", "year"], how="left")
+
+    cols_to_divide = [
+        "afp_cases",
+        "wild_poliovirus_cases",
+        "cvdpv_cases",
+        "total_cases",
+        "estimated_cases",
+        "cvdpv1",
+        "cvdpv2",
+        "cvdpv3",
+    ]
+    for col in cols_to_divide:
+        tb[f"{col}_per_million"] = tb[col] / tb["population"] * 1_000_000
+
+    # Population was only needed for the division above.
+    tb = tb.drop(columns=["population"])
+    return tb
+
+
+def list_of_who_countries(tb_regions: Table, ds_regions: Dataset) -> list:
+    """List of countries as defined by WHO.
+
+    Collects the members of every region in tb_regions whose 'defined_by' is "who", using list_members_of_region.
+
+    Raises ValueError if any of those regions has no members.
+    """
+    who_countries = []
+    who_regions = tb_regions[tb_regions["defined_by"] == "who"]
+    for region in who_regions["name"]:
+        country_list = list_members_of_region(region=region, ds_regions=ds_regions)
+        if not country_list:
+            raise ValueError(f"No countries found for region {region}")
+        who_countries.extend(country_list)
+    return who_countries
+
+
+def identify_low_risk_countries(tb: Table, tb_regions: Table, ds_regions: Dataset) -> Table:
+    """
+    Set 'polio_surveillance_status' to "Low risk" for SCREENING_YEAR rows of every country not in the higher-risk list.
+
+    Any status already present on those rows is overwritten. The table is modified in place and returned.
+    tb_regions and ds_regions are accepted but not used in this function.
+    """
+    # Identify low-risk countries (where the surveillance status can be disregarded)
+    # High risk entities are those identified in the table on page 48 in this document: https://polioeradication.org/wp-content/uploads/2022/04/GPSAP-2022-2024-EN.pdf
+    higher_risk_entities = [
+        "Chad",
+        "Democratic Republic of Congo",
+        "Ethiopia",
+        "Niger",
+        "Nigeria",
+        "Afghanistan",
+        "Pakistan",
+        "Somalia",
+        "Angola",
+        "Burkina Faso",
+        "Cameroon",
+        "Central African Republic",
+        "Guinea",
+        "Kenya",
+        "Mali",
+        "South Sudan",
+        "Yemen",
+        "Benin",
+        "Cote d'Ivoire",
+        "Equatorial Guinea",
+        "Guinea-Bissau",
+        "Madagascar",
+        "Mozambique",
+        "Togo",
+        "Iraq",
+        "Sudan",
+        "Syria",
+        "Myanmar",
+        "Papua New Guinea",
+        "Philippines",
+        "Burundi",
+        "Congo",
+        "Gabon",
+        "Gambia",
+        "Ghana",
+        "Liberia",
+        "Senegal",
+        "Sierra Leone",
+        "Uganda",
+        "Zambia",
+        "Djibouti",
+        "Egypt",
+        "Iran",
+        "Libya",
+        "Tajikistan",
+        "Ukraine",
+        "Indonesia",
+        "Nepal",
+        "Haiti",
+        "Laos",
+        "China",
+        "Eritrea",
+        "Malawi",
+        "Mauritania",
+        "Namibia",
+        "Rwanda",
+        "Tanzania",
+        "Zimbabwe",
+        "Lebanon",
+        "Bangladesh",
+        "India",
+        "East Timor",
+        "Bolivia",
+        "Cambodia",
+        "Malaysia",
+    ]
+
+    # Sanity check: every listed entity must appear in tb["country"] (presumably guards against names that differ
+    # from the harmonized ones - confirm).
+    difference = [item for item in higher_risk_entities if item not in tb["country"].unique()]
+    assert difference == [], f"Entities in the high-risk list that are not in the dataset: {difference}"
+
+    # Define the condition for which countries are not in high-risk entities
+    not_high_risk = ~tb["country"].isin(higher_risk_entities)
+
+    # Define the condition for screening year
+    is_screening_year = tb["year"] == SCREENING_YEAR
+
+    # Combine conditions and update 'polio_surveillance_status' for matching rows
+    tb.loc[not_high_risk & is_screening_year, "polio_surveillance_status"] = "Low risk"
+
+    return tb
+
+
+def add_screening_and_testing(tb: Table, tb_regions: Table, ds_regions: Dataset) -> Table:
+    """
+    Adds the polio surveillance status for SCREENING_YEAR based on the screening and testing rates.
+    For use in this chart: https://ourworldindata.org/grapher/polio-screening-and-testing
+
+    Parameters:
+    - tb: table containing polio surveillance data.
+    - tb_regions, ds_regions: regions table and regions dataset, used to list the countries in WHO-defined regions.
+
+    Returns:
+    - Modified table with a new column for polio surveillance status.
+    """
+    # Ensuring we have all the countries in the WHO regions - even if there isn't other polio data for them
+    who_countries = list_of_who_countries(tb_regions, ds_regions)
+    who_tb = Table({"country": who_countries, "year": SCREENING_YEAR}).copy_metadata(from_table=tb)
+    tb = tb.merge(who_tb, on=["country", "year"], how="outer")
+
+    # Add the polio surveillance status based on the screening and testing rates
+    tb.loc[
+        (tb["non_polio_afp_rate"] >= 2.0)
+        & (tb["pct_adequate_stool_collection"] >= 80)
+        & (tb["year"] == SCREENING_YEAR),
+        "polio_surveillance_status",
+    ] = "Adequate screening and testing"
+    tb.loc[
+        (tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR),
+        "polio_surveillance_status",
+    ] = "Inadequate testing"
+    tb.loc[
+        (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] >= 80) & (tb["year"] == SCREENING_YEAR),
+        "polio_surveillance_status",
+    ] = "Inadequate screening"
+    tb.loc[
+        (tb["non_polio_afp_rate"] < 2.0) & (tb["pct_adequate_stool_collection"] < 80) & (tb["year"] == SCREENING_YEAR),
+        "polio_surveillance_status",
+    ] = "Inadequate screening and testing"
+
+    tb = identify_low_risk_countries(tb, tb_regions, ds_regions)
+    # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise
+    tb["polio_surveillance_status"] = tb["polio_surveillance_status"].copy_metadata(tb["non_polio_afp_rate"])
+    return tb
+
+
+def add_correction_factor(tb: Table) -> Table:
+    """
+    Adding the correction factor to estimate polio cases based on reported cases.
+
+    Following Tebbens et al (2011) -https://www.sciencedirect.com/science/article/pii/S0264410X10014957?via%3Dihub
+
+    The correction factor is 7 for all years before 1996 and 1.11 for all countries when 1996 <= year <= 2000.
+    For all other rows: if the 'non_polio_afp_rate' is < 1 OR 'pct_adequate_stool_collection' < 60, then the correction factor = 7.
+    Otherwise, if the 'non_polio_afp_rate' is < 2 OR 'pct_adequate_stool_collection' < 80, then the correction factor = 2. If the 'non_polio_afp_rate' is >= 2 AND 'pct_adequate_stool_collection' >= 80, then the correction factor = 1.11.
+    If both 'non_polio_afp_rate' and 'pct_adequate_stool_collection' are missing then the correction factor is 7.
+
+    There are some manual changes we make:
+
+    - Namibia had 'pct_adequate_stool_collection' > 100 in 2011 and 2014 but for other years its correction factor is 1.11 so we set it as 1.11 for 2011 and 2014.
+
+    - For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988.
+
+    (Setting the correction factor to NA for all of 2021, where 'pct_adequate_stool_collection' seemed unreliable, is currently disabled - see the commented-out line below.)
+
+    The 'correction_factor' column is written on the table passed in, which is also returned.
+    """
+    # Each assignment overwrites the earlier ones on the rows it matches, so the rules are applied from lowest to
+    # highest precedence.
+    # NOTE(review): the precedence of the year-based rules over the surveillance-based ones follows the docstring
+    # above - confirm against Tebbens et al.
+    # If both 'non_polio_afp_rate' and 'pct_adequate_stool_collection' are missing then the correction factor is 7.
+    tb.loc[(tb["non_polio_afp_rate"].isna()) & (tb["pct_adequate_stool_collection"].isna()), "correction_factor"] = 7.0
+    # If the 'non_polio_afp_rate' is >= 2 AND 'pct_adequate_stool_collection' >= 80, then the correction factor = 1.11.
+    tb.loc[(tb["non_polio_afp_rate"] >= 2.0) & (tb["pct_adequate_stool_collection"] >= 80), "correction_factor"] = 1.11
+    # If the 'non_polio_afp_rate' is < 2 OR 'pct_adequate_stool_collection' < 80, then the correction factor = 2.
+    tb.loc[(tb["non_polio_afp_rate"] < 2.0) | (tb["pct_adequate_stool_collection"] < 80), "correction_factor"] = 2.0
+    # If the 'non_polio_afp_rate' is < 1 OR 'pct_adequate_stool_collection' < 60, then the correction factor = 7.
+    # This has to come after the rule above, whose condition also matches these rows.
+    tb.loc[(tb["non_polio_afp_rate"] < 1.0) | (tb["pct_adequate_stool_collection"] < 60), "correction_factor"] = 7.0
+    # Correction factor for years 1996-2000 is 1.11.
+    tb.loc[(tb["year"] >= 1996) & (tb["year"] <= 2000), "correction_factor"] = 1.11
+    # Correction factor for years before 1996 is 7.
+    tb.loc[tb["year"] < 1996, "correction_factor"] = 7.0
+
+    # tb.loc[tb["year"] == 2021, "correction_factor"] = np.nan
+
+    # Namibia had 'pct_adequate_stool_collection' > 100 in 2011 and 2014 but for other years its correction factor is 1.11 so we set it as 1.11 for 2011 and 2014.
+    tb.loc[(tb["country"] == "Namibia") & (tb["year"].isin([2011, 2014])), "correction_factor"] = 1.11
+    # For China 1989-92 we set the correction factor to 1.11 and in Oman in 1988.
+    tb.loc[(tb["country"] == "China") & (tb["year"].isin([1989, 1990, 1991, 1992])), "correction_factor"] = 1.11
+    tb.loc[(tb["country"] == "Oman") & (tb["year"].isin([1988])), "correction_factor"] = 1.11
+    # Not sure if this is the best way to handle this, the code fails because this indicator doesn't have origins otherwise
+    tb["correction_factor"].metadata.origins = tb["non_polio_afp_rate"].metadata.origins
+    return tb
+
+
+def clean_adequate_stool_collection(tb: Table) -> Table:
+    """
+    Some values for "Adequate stool collection" are over 100%, we should set these to NA.
+ """ + tb.loc[tb["pct_adequate_stool_collection"] > 100, "pct_adequate_stool_collection"] = pd.NA + return tb + + +def remove_pre_2001_data(tb: Table) -> Table: + """Remove data from before 2001.""" + tb = tb[tb["year"] >= 2001].reset_index(drop=True) + return tb diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json b/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json new file mode 100644 index 00000000000..383f9f88205 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.countries.json @@ -0,0 +1,196 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia (Plurinational State of)": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic People's Republic of Korea": "North Korea", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": 
"Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran (Islamic Republic of)": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + 
"Nigeria": "Nigeria", + "Niue": "Niue", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Republic of Korea": "South Korea", + "Republic of Moldova": "Moldova", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United Republic of Tanzania": "Tanzania", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela (Bolivarian Republic of)": "Venezuela", + 
"Viet Nam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe" +} \ No newline at end of file diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml b/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml new file mode 100644 index 00000000000..2882d8f9b3c --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.meta.yml @@ -0,0 +1,20 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + - Eradication of Diseases + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + polio_historical: + variables: + cases: + title: Polio cases + description_short: Number of new cases of polio reported in a given year. + unit: cases diff --git a/etl/steps/data/garden/who/2024-04-09/polio_historical.py b/etl/steps/data/garden/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..24975e73222 --- /dev/null +++ b/etl/steps/data/garden/who/2024-04-09/polio_historical.py @@ -0,0 +1,36 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("polio_historical") + + # Read table from meadow dataset. + tb = ds_meadow["polio_historical"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. 
+ ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..b4ae4f6b3b0 --- /dev/null +++ b/etl/steps/data/grapher/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,21 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("polio_free_countries") + tb = ds_garden["polio_free_countries"] + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/who/2024-04-08/polio.py b/etl/steps/data/grapher/who/2024-04-08/polio.py new file mode 100644 index 00000000000..89a65e7d6af --- /dev/null +++ b/etl/steps/data/grapher/who/2024-04-08/polio.py @@ -0,0 +1,32 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("polio") + + # Read table from garden dataset. + tb = ds_garden["polio"] + + # + # Process data. + # + tb = tb.drop(columns="footnote") + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..9d808e0ffef --- /dev/null +++ b/etl/steps/data/meadow/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,22 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("polio_free_countries.csv") + tb = snap.read() + tb = tb.format() + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/health/2024-04-12/polio_status.py b/etl/steps/data/meadow/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..b44471e4445 --- /dev/null +++ b/etl/steps/data/meadow/health/2024-04-12/polio_status.py @@ -0,0 +1,27 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot and read its data. + snap = paths.load_snapshot("polio_status.csv") + tb = snap.read() + + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(keys=["who_region", "year_certified_polio_free"]) + + # + # Save outputs. + # + # Create a new meadow dataset. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/who/2024-04-08/polio_afp.py b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..e7ba26edcf6 --- /dev/null +++ b/etl/steps/data/meadow/who/2024-04-08/polio_afp.py @@ -0,0 +1,32 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("polio_afp.csv") + + # Load data from snapshot. + tb = snap.read() + tb = tb.rename(columns={"Country / Territory / Region": "country", "Year": "year"}) + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/who/2024-04-09/polio_historical.py b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..684655a90b7 --- /dev/null +++ b/etl/steps/data/meadow/who/2024-04-09/polio_historical.py @@ -0,0 +1,38 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog import processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("polio_historical.xls") + + # Load data from snapshot. + tb = snap.read(sheet_name="Polio") + + # + # Process data. 
+ # + tb = pr.melt(tb, id_vars=["WHO_REGION", "ISO_code", "Cname", "Disease"], var_name="year", value_name="cases") + tb = tb.drop(columns=["WHO_REGION", "ISO_code", "Disease"], errors="raise") + tb = tb.rename(columns={"Cname": "country"}, errors="raise") + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/health/2024-04-12/polio_free_countries.csv.dvc b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc new file mode 100644 index 00000000000..e24468c687e --- /dev/null +++ b/snapshots/health/2024-04-12/polio_free_countries.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Polio-Free Countries + description: |- + This dataset shows the last recorded case of indigenous wild poliovirus (WPV) by country. As data methodology and quality varied widely across regions and countries in earlier years, this table is based on the best-available sources for the years before 2000. 
+ date_published: "2023" + # Citation + producer: Global Polio Eradication Initiative + citation_full: |- + Global Polio Eradication Initiative (2023) + + # Files + url_main: https://polioeradication.org/where-we-work/polio-free-countries/ + date_accessed: 2024-04-12 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://polioeradication.org/terms-of-use/ + +outs: + - md5: 3670959ed02bcdb84fc0080734fc1bf7 + size: 5123 + path: polio_free_countries.csv diff --git a/snapshots/health/2024-04-12/polio_free_countries.py b/snapshots/health/2024-04-12/polio_free_countries.py new file mode 100644 index 00000000000..74d8e6c0646 --- /dev/null +++ b/snapshots/health/2024-04-12/polio_free_countries.py @@ -0,0 +1,32 @@ +"""Script to create a snapshot of dataset. + +The data is from this page: https://polioeradication.org/where-we-work/polio-free-countries/ + +The table was copied into a csv and rearranged so that it only has two columns, country and year. + +Then this was uploaded to snapshot. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"health/{SNAPSHOT_VERSION}/polio_free_countries.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/health/2024-04-12/polio_status.csv.dvc b/snapshots/health/2024-04-12/polio_status.csv.dvc new file mode 100644 index 00000000000..6a1897c27ef --- /dev/null +++ b/snapshots/health/2024-04-12/polio_status.csv.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Global Polio Eradication Initiative - Certification Status + date_published: "2021" + # Citation + producer: Global Polio Eradication Initiative + citation_full: |- + Global Polio Eradication Initiative (2024) + attribution: Global Polio Eradication Initiative (2024) + attribution_short: GPEI + + # Files + url_main: https://polioeradication.org/ + date_accessed: 2024-04-12 + + # License + license: + name: https://polioeradication.org/terms-of-use/ + url: https://polioeradication.org/terms-of-use/ +outs: + - md5: e9052b2095a1c01afe2f954eda183344 + size: 140 + path: polio_status.csv diff --git a/snapshots/health/2024-04-12/polio_status.py b/snapshots/health/2024-04-12/polio_status.py new file mode 100644 index 00000000000..f0c03782c79 --- /dev/null +++ b/snapshots/health/2024-04-12/polio_status.py @@ -0,0 +1,56 @@ +"""Script to create a snapshot of dataset. + +Data are transcribed from this webpage: + +https://polioeradication.org/polio-today/preparing-for-a-polio-free-world/certification/ + +""" + +from pathlib import Path + +import click +import pandas as pd +from owid.datautils.io import df_to_file + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + # Create a new snapshot. 
+ snap = Snapshot(f"health/{SNAPSHOT_VERSION}/polio_status.csv") + + df = pd.DataFrame( + data={ + "who_region": [ + "Africa", + "Americas", + "South-East Asia", + "Europe", + "Eastern Mediterranean", + "Western Pacific", + ], + "year_certified_polio_free": [ + 2020, + 1994, + 2014, + 2002, + pd.NA, + 2000, + ], + } + ) + snap.create_snapshot(data=df, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-08/polio_afp.csv.dvc b/snapshots/who/2024-04-08/polio_afp.csv.dvc new file mode 100644 index 00000000000..2ee7ee44d99 --- /dev/null +++ b/snapshots/who/2024-04-08/polio_afp.csv.dvc @@ -0,0 +1,26 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Polio Cases and Acute Flaccid Paralysis (AFP) Surveillance + date_published: "2024-04-04" + + # Citation + producer: World Health Organization + citation_full: |- + World Health Organization - Polio cases (2024). + attribution_short: WHO + # Files + url_main: https://extranet.who.int/polis/public/CaseCount.aspx + date_accessed: 2024-04-08 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://www.who.int/about/policies/publishing/copyright + +outs: + - md5: 00d57ac66f4507ae66a35fecec365971 + size: 156591 + path: polio_afp.csv diff --git a/snapshots/who/2024-04-08/polio_afp.py b/snapshots/who/2024-04-08/polio_afp.py new file mode 100644 index 00000000000..aae52da6c33 --- /dev/null +++ b/snapshots/who/2024-04-08/polio_afp.py @@ -0,0 +1,37 @@ +"""Script to create a snapshot of dataset. + +To find the data needed to run this step following these steps: + + - Go to https://extranet.who.int/polis/public/CaseCount.aspx + - Select 'World' in the Region list + - Select all countries in the year of onset list (you may need to use cmd+a to do this) + - Ensure the 'Country Detail' box is checked + - Click 'Show data' + - Select the outputted table and copy it to a CSV file, e.g. 
in excel + - This is the local file to be loaded in the snapshot + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"who/{SNAPSHOT_VERSION}/polio_afp.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-09/polio_historical.py b/snapshots/who/2024-04-09/polio_historical.py new file mode 100644 index 00000000000..bac3f8c044c --- /dev/null +++ b/snapshots/who/2024-04-09/polio_historical.py @@ -0,0 +1,31 @@ +"""Script to create a snapshot of dataset. + +The data is no longer available from the WHO but it is available on web archive e.g. here "https://web.archive.org/web/20200713223806/http://www.who.int/immunization/monitoring_surveillance/data/incidence_series.xls" + +It can be downloaded and then used to create a snapshot from the local file. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"who/{SNAPSHOT_VERSION}/polio_historical.xls") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/who/2024-04-09/polio_historical.xls.dvc b/snapshots/who/2024-04-09/polio_historical.xls.dvc new file mode 100644 index 00000000000..10ab583cea4 --- /dev/null +++ b/snapshots/who/2024-04-09/polio_historical.xls.dvc @@ -0,0 +1,28 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: WHO Historical Polio Dataset + date_published: "2019-12-10" + description: |- + The World Health Organization (WHO) provides a historical dataset on polio cases, but it is no longer available directly from the WHO website. Instead the web archived version can be downloaded [here](https://web.archive.org/web/20200101000000*/http://www.who.int/entity/immunization/monitoring_surveillance/data/incidence_series.xls?ua=1) and it can be accessed by opening in 'Read Only' mode. + + # Citation + producer: World Health Organization + citation_full: |- + World Health Organization - Historical Polio Dataset (2019). + attribution_short: WHO + # Files + url_main: https://www.who.int/news-room/fact-sheets/detail/poliomyelitis + date_accessed: 2024-04-09 + + # License + license: + name: CC BY-NC-SA 3.0 IGO + url: https://www.who.int/about/policies/publishing/copyright + +outs: + - md5: 189201470a046c95b5f38c05a77fd6c2 + size: 612864 + path: polio_historical.xls