📊 h5n1: regular update (#2700)

* snapshot * update * add harmonization
owid · May 28, 2024 · 917942d · 917942d
1 parent dd7f559
commit 917942d
Show file tree

Hide file tree

Showing 8 changed files with 367 additions and 0 deletions.
diff --git a/dag/health.yml b/dag/health.yml
@@ -609,3 +609,12 @@ steps:
     - data://meadow/neglected_tropical_diseases/2024-05-18/funding
   data://grapher/neglected_tropical_diseases/2024-05-18/funding:
     - data://garden/neglected_tropical_diseases/2024-05-18/funding
+
+  # WHO Avian Influenza H5N1
+  data://meadow/who/latest/avian_influenza_ah5n1:
+    - snapshot://who/latest/avian_influenza_ah5n1.csv
+  data://garden/who/latest/avian_influenza_ah5n1:
+    - data://meadow/who/latest/avian_influenza_ah5n1
+    - data://garden/regions/2023-01-01/regions
+  data://grapher/who/latest/avian_influenza_ah5n1:
+    - data://garden/who/latest/avian_influenza_ah5n1
diff --git a/etl/steps/data/garden/who/latest/avian_influenza_ah5n1.countries.json b/etl/steps/data/garden/who/latest/avian_influenza_ah5n1.countries.json
@@ -0,0 +1,26 @@
+{
+  "Azerbaijan": "Azerbaijan",
+  "Bangladesh": "Bangladesh",
+  "Cambodia": "Cambodia",
+  "Canada": "Canada",
+  "Chile": "Chile",
+  "China": "China",
+  "Djibouti": "Djibouti",
+  "Ecuador": "Ecuador",
+  "Egypt": "Egypt",
+  "India": "India",
+  "Indonesia": "Indonesia",
+  "Iraq": "Iraq",
+  "Laos": "Laos",
+  "Myanmar": "Myanmar",
+  "Nepal": "Nepal",
+  "Nigeria": "Nigeria",
+  "Pakistan": "Pakistan",
+  "Spain": "Spain",
+  "Thailand": "Thailand",
+  "Turkey": "Turkey",
+  "United Kingdom": "United Kingdom",
+  "United States": "United States",
+  "Vietnam": "Vietnam",
+  "Australia": "Australia"
+}
diff --git a/etl/steps/data/garden/who/latest/avian_influenza_ah5n1.meta.yml b/etl/steps/data/garden/who/latest/avian_influenza_ah5n1.meta.yml
@@ -0,0 +1,24 @@
+tables:
+  avian_influenza_ah5n1_month:
+    variables:
+      avian_cases_month:
+        title: Human cases with highly pathogenic avian influenza A/H5N1 (monthly)
+        unit: cases
+        description_short: |-
+          Monthly number of human cases with highly pathogenic avian influenza A/H5N1.
+        processing_level: minor
+        display:
+          numDecimalPlaces: 0
+          conversionFactor: 1
+
+  avian_influenza_ah5n1_year:
+    variables:
+      avian_cases_year:
+        title: Human cases with highly pathogenic avian influenza A/H5N1 (yearly)
+        unit: cases
+        description_short: |-
+          Yearly number of human cases with highly pathogenic avian influenza A/H5N1.
+        processing_level: minor
+        display:
+          numDecimalPlaces: 0
+          conversionFactor: 1
diff --git a/etl/steps/data/garden/who/latest/avian_influenza_ah5n1.py b/etl/steps/data/garden/who/latest/avian_influenza_ah5n1.py
@@ -0,0 +1,141 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+
+import owid.catalog.processing as pr
+import pandas as pd
+from owid.catalog import Dataset, Table
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+# Continent-level regions for which aggregates are added in `add_regions`. Rows for these
+# regions are left out when summing the "World" total in `add_world`.
+REGIONS = [
+    "Asia",
+    "Africa",
+    "North America",
+    "South America",
+    "Europe",
+    "Oceania",
+]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("avian_influenza_ah5n1")
+    # Load regions dataset.
+    ds_regions = paths.load_dataset("regions")
+    # Read table from meadow dataset.
+    tb = ds_meadow["avian_influenza_ah5n1"].reset_index()
+
+    #
+    # Process data.
+    #
+    mask = tb["range"] == "All"
+    tb_year = tb[mask].drop(columns=["range"])
+    tb_month = tb[~mask].drop(columns=["range"])
+
+    # Obtain date
+    ## Yearly data
+    tb_year = tb_year.rename(columns={"month": "date"})
+    ## Monthly data
+    # date_1 = pd.to_datetime(tb_month["month"], format="%b-%y", errors="coerce")
+    # date_2 = pd.to_datetime(tb_month["month"], format="%y-%b", errors="coerce")
+    # date_3 = pd.to_datetime("200" + tb_month["month"].astype(str), format="%Y-%b", errors="coerce")
+    # tb_month["date"] = date_1.fillna(date_2).fillna(date_3)
+    tb_month["date"] = pd.to_datetime(tb_month["month"], format="%m/%d/%Y")
+    assert tb_month["date"].notna().all(), "Some dates could not be parsed."
+    tb_month = tb_month.drop(columns=["month"])
+
+    # Harmonize country names
+    tb_month = geo.harmonize_countries(df=tb_month, countries_file=paths.country_mapping_path)
+    tb_year = geo.harmonize_countries(df=tb_year, countries_file=paths.country_mapping_path)
+
+    # Add aggregates
+    tb_month = add_regions(tb_month, ds_regions)
+    tb_month = add_world(tb_month)
+    tb_year = add_regions(tb_year, ds_regions)
+    tb_year = add_world(tb_year)
+
+    # Rename columns
+    tb_year = tb_year.rename(
+        columns={
+            "date": "year",
+            "avian_cases": "avian_cases_year",
+        }
+    )
+    tb_month = tb_month.rename(
+        columns={
+            "avian_cases": "avian_cases_month",
+        }
+    )
+
+    # Set dtype to numeric
+    tb_year["year"] = tb_year["year"].astype(str).astype(int)
+
+    # Sanity check
+    assert tb_year["year"].max() == 2024
+    assert tb_year["year"].min() == 1997
+
+    # Set index
+    tb_month = tb_month.format(["country", "date"])
+    tb_year = tb_year.format(["country", "year"])
+
+    # Set short_name
+    tb_month.metadata.short_name = f"{tb_month.metadata.short_name}_month"
+    tb_year.metadata.short_name = f"{tb_year.metadata.short_name}_year"
+
+    #
+    # Save outputs.
+    #
+    tables = [
+        tb_month,
+        tb_year,
+    ]
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=tables, default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def add_regions(tb: Table, ds_regions: Dataset) -> Table:
+    "Add regions to the table."
+    for region in REGIONS:
+        # List of countries in region.
+        countries_in_region = geo.list_members_of_region(region=region, ds_regions=ds_regions)
+
+        # Add region
+        tb_region = tb[tb["country"].isin(countries_in_region)]
+        tb_region = tb_region.assign(country=region)
+        tb_region = tb_region.groupby(["date", "country"], as_index=False, observed=True)["avian_cases"].sum()
+
+        # Combine
+        tb = pr.concat([tb, tb_region], ignore_index=True)
+
+    return tb
+
+
+def add_world(tb: Table) -> Table:
+    """Add world aggregate to the table."""
+    # Ignore regions
+    tb_world = tb[~tb["country"].isin(REGIONS)].copy()
+
+    # Aggregate
+    tb_world = tb_world.groupby("date", as_index=False, observed=True)["avian_cases"].sum()
+    tb_world = tb_world.assign(country="World")
+
+    # Combine
+    tb = pr.concat(
+        [
+            tb,
+            tb_world,
+        ],
+        ignore_index=True,
+    )
+
+    return tb
diff --git a/etl/steps/data/grapher/who/latest/avian_influenza_ah5n1.py b/etl/steps/data/grapher/who/latest/avian_influenza_ah5n1.py
@@ -0,0 +1,68 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset, grapher_checks
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("avian_influenza_ah5n1")
+
+    # Read table from garden dataset.
+    tb_month = ds_garden["avian_influenza_ah5n1_month"].reset_index()
+    tb_year = ds_garden["avian_influenza_ah5n1_year"]
+
+    #
+    # Process data.
+    #
+    # Get zeroDay as the minimum date in the dataset and set it to zeroDay
+    tb_month = add_num_days(tb_month)
+    tb_month = tb_month.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    tables = [
+        tb_month,
+        tb_year,
+    ]
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(dest_dir, tables=tables, default_metadata=ds_garden.metadata)
+
+    #
+    # Checks.
+    #
+    grapher_checks(ds_grapher)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
+
+
+def add_num_days(tb: Table) -> Table:
+    """Add column with number of days after zero_day.
+
+    Also, drop `date` column.
+    """
+    column_indicator = "avian_cases_month"
+
+    if tb[column_indicator].metadata.display is None:
+        tb[column_indicator].metadata.display = {}
+
+    zero_day = tb["date"].min()
+    tb[column_indicator].metadata.display["yearIsDay"] = True
+    tb[column_indicator].metadata.display["zeroDay"] = zero_day.strftime("%Y-%m-%d")
+
+    # Add column with number of days after zero_day
+    tb["year"] = (tb["date"] - zero_day).dt.days
+
+    # Drop date column
+    tb = tb.drop(columns=["date"])
+
+    return tb
diff --git a/etl/steps/data/meadow/who/latest/avian_influenza_ah5n1.py b/etl/steps/data/meadow/who/latest/avian_influenza_ah5n1.py
@@ -0,0 +1,42 @@
+"""Load a snapshot and create a meadow dataset."""
+
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("avian_influenza_ah5n1.csv")
+
+    # Load data from snapshot.
+    tb = snap.read()
+
+    #
+    # Process data.
+    #
+    # Unpivot
+    tb = tb.melt(id_vars=["Range", "Month"], var_name="country", value_name="avian_cases")
+
+    # Remove unnamed
+    tb = tb[~tb["country"].str.contains("Unnamed")]
+
+    # Dtypes
+    tb["avian_cases"] = tb["avian_cases"].astype("int")
+
+    # Create a new table and ensure all columns are snake-case.
+    tb = tb.format(["range", "month", "country"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_meadow.save()
diff --git a/snapshots/who/latest/avian_influenza_ah5n1.csv.dvc b/snapshots/who/latest/avian_influenza_ah5n1.csv.dvc
@@ -0,0 +1,23 @@
+meta:
+  origin:
+    producer: WHO, Global Influenza Programme
+    title: Human Cases with Highly Pathogenic Avian Influenza A/H5N1
+    description: |-
+      This dataset contains all human infections with HPAI A(H5N1) bird flu virus reported to the World Health Organization (WHO), since the first human cases in 1997.
+
+      A(H5N1) bird flu viruses first emerged in southern China in 1996. Those viruses caused large poultry outbreaks in Hong Kong in 1997, which resulted in 18 human infections. The 1997 bird outbreak was controlled, but the A(H5N1) bird flu viruses were not eradicated in birds and re-surfaced in 2003 to spread widely in birds throughout Asia, and later in Africa, Europe, and the Middle East, causing poultry outbreaks and sporadic human infections. Since 2003, more than 23 countries have reported more than 880 sporadic human infections with A(H5N1) bird flu viruses to WHO.
+
+      A(H5N1) bird flu viruses that are currently circulating in wild birds and poultry in much of the world are genetically different from earlier versions of the virus and emerged to become the predominant subtype of HPAI H5 in the fall of 2021. These viruses have caused sporadic wild bird infections and poultry outbreaks in many countries, including the United States, with spillover to mammals in some countries. In contrast to previous A(H5N1) viruses, which still circulate to a lesser extent in several countries, at this time, a small number of sporadic human cases with current A(H5N1) bird flu viruses have been reported globally. However, illness in humans from all bird flu virus infections has ranged in severity from no symptoms or mild illness to severe disease that resulted in death.
+    citation_full: |-
+      Human Cases with Highly Pathogenic Avian Influenza A/H5N1. World Health Organization, Global Influenza Programme; 2024. Licence: CC BY-NC-SA 3.0 IGO. Retrieved from CDC May 23, 2024.
+    attribution_short: WHO
+    url_main: https://www.cdc.gov/flu/avianflu/chart-epi-curve-ah5n1.html
+    date_accessed: 2024-05-28
+    date_published: '2024-05-03'
+    license:
+      name: CC BY-NC-SA 3.0 IGO
+      url: https://www.who.int/about/policies/publishing/copyright
+outs:
+  - md5: bc8c65b0f8027d0ede161150c3df891d
+    size: 24329
+    path: avian_influenza_ah5n1.csv
diff --git a/snapshots/who/latest/avian_influenza_ah5n1.py b/snapshots/who/latest/avian_influenza_ah5n1.py
@@ -0,0 +1,34 @@
+"""This data is collected by the WHO, and summarised in PDF reports.
+
+CDC provides this same data but in a machine-readable format, which one can download from https://www.cdc.gov/flu/avianflu/chart-epi-curve-ah5n1.html under "Download data (CSV)".
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset: the name of the folder that contains this script.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
+def main(upload: bool, path_to_file: str) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"who/{SNAPSHOT_VERSION}/avian_influenza_ah5n1.csv")
+
+    # Ensure destination folder exists.
+    snap.path.parent.mkdir(exist_ok=True, parents=True)
+
+    # Copy local data file to snapshots data folder.
+    snap.path.write_bytes(Path(path_to_file).read_bytes())
+
+    # Add file to DVC and upload to S3.
+    snap.dvc_add(upload=upload)
+
+
+if __name__ == "__main__":
+    main()