owid · spoonerf · Mar 5, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/dag/health.yml b/dag/health.yml
@@ -1140,3 +1140,14 @@ steps:
     - data://garden/who/2025-01-17/mortality_database_vaccine_preventable
   data://grapher/health/2025-03-04/diphtheria_deaths:
     - data://garden/health/2025-03-04/diphtheria_deaths
+
+  #
+  # US Pertussis cases - CDC
+  #
+  data://meadow/cdc/2025-03-04/pertussis_cases:
+    - snapshot://cdc/2025-03-04/pertussis_cases.csv
+  data://garden/cdc/2025-03-04/pertussis_cases:
+    - data://meadow/cdc/2025-03-04/pertussis_cases
+    - data://garden/demography/2024-07-15/population
+  data://grapher/cdc/2025-03-04/pertussis_cases:
+    - data://garden/cdc/2025-03-04/pertussis_cases
diff --git a/etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.countries.json b/etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.countries.json
@@ -0,0 +1,3 @@
+{
+  "United States": "United States"
+}
diff --git a/etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.meta.yml b/etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.meta.yml
@@ -0,0 +1,27 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Vaccination
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+tables:
+  pertussis_cases:
+    variables:
+      cases:
+        title: Pertussis cases
+        unit: cases
+        description_short: Reported number of [pertussis](#dod:pertussis) cases in the United States.
+        display:
+          numDecimalPlaces: 0
+      case_rate:
+        title: Pertussis cases per 100,000 people
+        unit: cases per 100,000 people
+        description_short: Reported number of [pertussis](#dod:pertussis) cases in the United States per 100,000 people.
+        display:
+          numDecimalPlaces: 1
diff --git a/etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.py b/etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.py
@@ -0,0 +1,42 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import processing as pr
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs: meadow pertussis cases and the population table used as the rate denominator.
+    #
+    # Load the meadow dataset and the population dataset.
+    ds_meadow = paths.load_dataset("pertussis_cases")
+    ds_population = paths.load_dataset("population")
+    tb_pop = ds_population.read("population", reset_metadata="keep_origins")
+    # Read table from meadow dataset.
+    tb = ds_meadow.read("pertussis_cases")
+
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)  # Standardize country names.
+    tb = pr.merge(
+        tb,
+        tb_pop,
+        on=["country", "year"],
+        how="left",  # Keep every case row, even when no population row matches.
+    )
+    tb["case_rate"] = tb["cases"] / tb["population"] * 100000  # Cases per 100,000 people.
+
+    tb = tb.drop(columns=["population", "source", "world_pop_share"])  # Presumably all added by the merge - confirm.
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset in `dest_dir`, using the meadow dataset's metadata as the default.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/cdc/2025-03-04/pertussis_cases.py b/etl/steps/data/grapher/cdc/2025-03-04/pertussis_cases.py
@@ -0,0 +1,26 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs: the garden pertussis dataset, passed through to grapher without changes.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("pertussis_cases")
+
+    # Read table from garden dataset, leaving its index in place (reset_index=False).
+    tb = ds_garden.read("pertussis_cases", reset_index=False)
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset in `dest_dir`, using the garden dataset's metadata as the default.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/cdc/2025-03-04/pertussis_cases.py b/etl/steps/data/meadow/cdc/2025-03-04/pertussis_cases.py
@@ -0,0 +1,37 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs: the CDC pertussis cases snapshot (a CSV file).
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("pertussis_cases.csv")
+
+    # Load the snapshot's data into a table.
+    tb = snap.read()
+
+    #
+    # Process data.
+    #
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tables = [tb.format(["country", "year"])]
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset in `dest_dir`, using the snapshot's metadata as the default.
+    ds_meadow = create_dataset(
+        dest_dir,
+        tables=tables,
+        check_variables_metadata=True,  # Presumably validates per-variable metadata - confirm.
+        default_metadata=snap.metadata,
+    )
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/cdc/2025-03-04/pertussis_cases.csv.dvc b/snapshots/cdc/2025-03-04/pertussis_cases.csv.dvc
@@ -0,0 +1,27 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Historical pertussis cases in the United States - Centers for Disease Control and Prevention
+    description: |-
+      Data on historical pertussis cases in the United States from 1922 onwards, as reported by the Centers for Disease Control and Prevention. The data is sourced from the CDC's Summary of Notifiable Diseases, which is published annually.
+
+    date_published: "2023"
+    # Citation
+    producer: Centers for Disease Control and Prevention
+    citation_full: |2-
+
+      Pertussis Cases by Year (1922-2022), Centers for Disease Control and Prevention, United States (2023).
+
+    # Files
+    url_main: https://www.cdc.gov/pertussis/php/surveillance/pertussis-cases-by-year.html
+    date_accessed: 2025-03-04
+
+    # License
+    license:
+      name: Public domain
+outs:
+  - md5: 8a211f9651c5a8163b214056d43ee09c
+    size: 2530
+    path: pertussis_cases.csv
diff --git a/snapshots/cdc/2025-03-04/pertussis_cases.py b/snapshots/cdc/2025-03-04/pertussis_cases.py
@@ -0,0 +1,59 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot; its version is the name of the folder containing this script.
+    snap = Snapshot(f"cdc/{SNAPSHOT_VERSION}/pertussis_cases.csv")
+    df = get_data()  # Scrape the case counts from the CDC page.
+    # Add the scraped data to DVC and upload to S3; `upload` is forwarded from --upload/--skip-upload.
+    snap.create_snapshot(upload=upload, data=df)
+
+
+def get_data() -> pd.DataFrame:
+    url = "https://www.cdc.gov/pertussis/php/surveillance/pertussis-cases-by-year.html"
+    # Scrape data from the CDC website.
+    # Fetch the webpage content
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Locate the table
+    table = soup.find(
+        "table", {"class": "table table-bordered show-more-div-234 table-striped main|gray-l4 nein-scroll"}
+    )
+
+    # Extract table headers
+    headers = [th.text.strip() for th in table.find("thead").find_all("th")]
+
+    # Extract table rows
+    data = []
+    for row in table.find("tbody").find_all("tr"):
+        columns = row.find_all("td")
+        if columns:
+            year = row.find("th").text.strip()  # Year is in <th>
+            cases = columns[0].text.strip().replace(",", "")  # Remove commas from numbers
+            data.append([year, int(cases)])
+
+    # Create a DataFrame
+    df = pd.DataFrame(data, columns=headers)
+    df.columns = ["year", "cases"]
+    df["country"] = "United States"
+
+    return df
+
+
+if __name__ == "__main__":
+    main()