Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 Pertussis cases - CDC #4068

Merged
merged 7 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1140,3 +1140,14 @@ steps:
- data://garden/who/2025-01-17/mortality_database_vaccine_preventable
data://grapher/health/2025-03-04/diphtheria_deaths:
- data://garden/health/2025-03-04/diphtheria_deaths

#
# US Pertussis cases - CDC
#
data://meadow/cdc/2025-03-04/pertussis_cases:
- snapshot://cdc/2025-03-04/pertussis_cases.csv
data://garden/cdc/2025-03-04/pertussis_cases:
- data://meadow/cdc/2025-03-04/pertussis_cases
- data://garden/demography/2024-07-15/population
data://grapher/cdc/2025-03-04/pertussis_cases:
- data://garden/cdc/2025-03-04/pertussis_cases
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"United States": "United States"
}
27 changes: 27 additions & 0 deletions etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Vaccination

# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365

tables:
pertussis_cases:
variables:
cases:
title: Pertussis cases
unit: cases
description_short: Reported number of [pertussis](#dod:pertussis) cases in the United States.
display:
numDecimalPlaces: 0
case_rate:
title: Pertussis cases per 100,000 people
unit: cases per 100,000 people
description_short: Reported number of [pertussis](#dod:pertussis) cases in the United States per 100,000 people.
display:
numDecimalPlaces: 1
42 changes: 42 additions & 0 deletions etl/steps/data/garden/cdc/2025-03-04/pertussis_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Load a meadow dataset and create a garden dataset."""

from owid.catalog import processing as pr

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Build the garden dataset of pertussis cases and case rates.

    Reads the meadow ``pertussis_cases`` table and the ``population`` table,
    harmonizes country names, attaches population by country and year, and
    derives ``case_rate`` as cases per 100,000 people before saving.

    Args:
        dest_dir: Destination directory passed through to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    # Meadow dataset with the reported case counts.
    ds_meadow = paths.load_dataset("pertussis_cases")
    # Population dataset, used as the denominator of the case rate.
    ds_population = paths.load_dataset("population")
    tb_pop = ds_population.read("population", reset_metadata="keep_origins")
    tb = ds_meadow.read("pertussis_cases")

    #
    # Process data.
    #
    # Standardize country names using this step's mapping file.
    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)

    # Left merge: every case row is kept, whether or not a population value exists for it.
    tb = pr.merge(tb, tb_pop, on=["country", "year"], how="left")

    # Cases per 100,000 people.
    per_100k = 100000
    tb["case_rate"] = tb["cases"] / tb["population"] * per_100k

    # Remove the columns brought in by the population table; only case indicators remain.
    population_columns = ["population", "source", "world_pop_share"]
    tb = tb.drop(columns=population_columns)
    tb = tb.format(["country", "year"])

    #
    # Save outputs.
    #
    # The garden dataset takes its default metadata from the meadow dataset.
    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
    ds_garden.save()
26 changes: 26 additions & 0 deletions etl/steps/data/grapher/cdc/2025-03-04/pertussis_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the grapher dataset from the garden ``pertussis_cases`` table.

    The table is passed through unchanged; the garden dataset's metadata is
    used as the default metadata of the new dataset.

    Args:
        dest_dir: Destination directory passed through to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    garden_dataset = paths.load_dataset("pertussis_cases")

    # Read the table with its index left in place (reset_index=False).
    table = garden_dataset.read("pertussis_cases", reset_index=False)

    #
    # Save outputs.
    #
    grapher_dataset = create_dataset(
        dest_dir,
        tables=[table],
        default_metadata=garden_dataset.metadata,
    )
    grapher_dataset.save()
37 changes: 37 additions & 0 deletions etl/steps/data/meadow/cdc/2025-03-04/pertussis_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the meadow dataset from the ``pertussis_cases.csv`` snapshot.

    Args:
        dest_dir: Destination directory passed through to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    snapshot = paths.load_snapshot("pertussis_cases.csv")
    table = snapshot.read()

    #
    # Process data.
    #
    # Format the table with (country, year) as its key columns.
    table = table.format(["country", "year"])

    #
    # Save outputs.
    #
    # The meadow dataset takes its default metadata from the snapshot.
    meadow_dataset = create_dataset(
        dest_dir,
        tables=[table],
        check_variables_metadata=True,
        default_metadata=snapshot.metadata,
    )
    meadow_dataset.save()
27 changes: 27 additions & 0 deletions snapshots/cdc/2025-03-04/pertussis_cases.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: Historical pertussis cases in the United States - Centers for Disease Control and Prevention
description: |-
Data on historical pertussis cases in the United States from 1922 onwards, as reported by the Centers for Disease Control and Prevention. The data is sourced from the CDC's Summary of Notifiable Diseases, which is published annually.

date_published: "2023"
# Citation
producer: Centers for Disease Control and Prevention
citation_full: |2-

Pertussis Cases by Year (1922-2022), Centers for Disease Control and Prevention, United States (2023).

# Files
url_main: https://www.cdc.gov/pertussis/php/surveillance/pertussis-cases-by-year.html
date_accessed: 2025-03-04

# License
license:
name: Public domain
outs:
- md5: 8a211f9651c5a8163b214056d43ee09c
size: 2530
path: pertussis_cases.csv
59 changes: 59 additions & 0 deletions snapshots/cdc/2025-03-04/pertussis_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click
import pandas as pd
import requests
from bs4 import BeautifulSoup

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    """Scrape the pertussis case table and store it as a snapshot.

    Args:
        upload: Forwarded to ``Snapshot.create_snapshot``; set from the
            ``--upload/--skip-upload`` command-line flag.
    """
    # The snapshot path is versioned by the name of the directory holding this script.
    snapshot_uri = f"cdc/{SNAPSHOT_VERSION}/pertussis_cases.csv"
    snapshot = Snapshot(snapshot_uri)

    # Scrape the data and hand the resulting DataFrame to the snapshot.
    scraped = get_data()
    snapshot.create_snapshot(upload=upload, data=scraped)


def get_data() -> pd.DataFrame:
    """Scrape the CDC "pertussis cases by year" table into a DataFrame.

    Returns:
        A DataFrame with one row per table row and the columns ``year``
        (text of the row's ``<th>`` cell), ``cases`` (int, thousands
        separators removed) and ``country`` (always ``"United States"``).

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If the expected table (or its body) is not found in the
            page, e.g. because the page markup changed.
    """
    url = "https://www.cdc.gov/pertussis/php/surveillance/pertussis-cases-by-year.html"

    # Fetch the page. Use a timeout so the script cannot hang forever, and fail
    # loudly on HTTP errors instead of trying to parse an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the table by its exact class attribute.
    table = soup.find(
        "table", {"class": "table table-bordered show-more-div-234 table-striped main|gray-l4 nein-scroll"}
    )
    if table is None:
        raise ValueError(f"Could not find the pertussis cases table at {url}; the page layout may have changed.")

    body = table.find("tbody")
    if body is None:
        raise ValueError(f"The pertussis cases table at {url} has no <tbody>; the page layout may have changed.")

    # Each data row holds the year in a <th> cell and the case count in the first <td>.
    data = []
    for row in body.find_all("tr"):
        columns = row.find_all("td")
        if not columns:
            # Rows without <td> cells carry no case count.
            continue
        year = row.find("th").text.strip()
        cases = columns[0].text.strip().replace(",", "")  # Remove thousands separators.
        data.append([year, int(cases)])

    # Column names are fixed here rather than taken from the page's <thead>,
    # so the result does not depend on how the page labels its headers.
    df = pd.DataFrame(data, columns=["year", "cases"])
    df["country"] = "United States"

    return df


if __name__ == "__main__":
main()