
Commit

first implementation of rsv (#2121)
* first implementation of rsv

* lint

* clean up and fix tests

* more logging and lint

* suggested changes and refactor pull_nhsn_data

* linting

* trailing comma

* fixing backup overwrite

---------

Co-authored-by: nmdefries <[email protected]>
aysim319 and nmdefries authored Feb 14, 2025
1 parent d0a6393 commit 70abef5
Showing 11 changed files with 753 additions and 394 deletions.
13 changes: 12 additions & 1 deletion nhsn/delphi_nhsn/constants.py
@@ -8,29 +8,36 @@
# column name from socrata
TOTAL_ADMISSION_COVID_COL = "totalconfc19newadm"
TOTAL_ADMISSION_FLU_COL = "totalconfflunewadm"
TOTAL_ADMISSION_RSV_COL = "totalconfrsvnewadm"
NUM_HOSP_REPORTING_COVID_COL = "totalconfc19newadmhosprep"
NUM_HOSP_REPORTING_FLU_COL = "totalconfflunewadmhosprep"

NUM_HOSP_REPORTING_RSV_COL = "totalconfrsvnewadmhosprep"
# signal name
TOTAL_ADMISSION_COVID = "confirmed_admissions_covid_ew"
TOTAL_ADMISSION_FLU = "confirmed_admissions_flu_ew"
TOTAL_ADMISSION_RSV = "confirmed_admissions_rsv_ew"
NUM_HOSP_REPORTING_COVID = "hosprep_confirmed_admissions_covid_ew"
NUM_HOSP_REPORTING_FLU = "hosprep_confirmed_admissions_flu_ew"
NUM_HOSP_REPORTING_RSV = "hosprep_confirmed_admissions_rsv_ew"

SIGNALS_MAP = {
TOTAL_ADMISSION_COVID: TOTAL_ADMISSION_COVID_COL,
TOTAL_ADMISSION_FLU: TOTAL_ADMISSION_FLU_COL,
TOTAL_ADMISSION_RSV: TOTAL_ADMISSION_RSV_COL,
NUM_HOSP_REPORTING_COVID: NUM_HOSP_REPORTING_COVID_COL,
NUM_HOSP_REPORTING_FLU: NUM_HOSP_REPORTING_FLU_COL,
NUM_HOSP_REPORTING_RSV: NUM_HOSP_REPORTING_RSV_COL,
}

TYPE_DICT = {
"timestamp": "datetime64[ns]",
"geo_id": str,
TOTAL_ADMISSION_COVID: float,
TOTAL_ADMISSION_FLU: float,
TOTAL_ADMISSION_RSV: float,
NUM_HOSP_REPORTING_COVID: float,
NUM_HOSP_REPORTING_FLU: float,
NUM_HOSP_REPORTING_RSV: float,
}

# signal mapping for secondary, preliminary source
@@ -39,15 +46,19 @@
PRELIM_SIGNALS_MAP = {
f"{TOTAL_ADMISSION_COVID}_prelim": TOTAL_ADMISSION_COVID_COL,
f"{TOTAL_ADMISSION_FLU}_prelim": TOTAL_ADMISSION_FLU_COL,
f"{TOTAL_ADMISSION_RSV}_prelim": TOTAL_ADMISSION_RSV_COL,
f"{NUM_HOSP_REPORTING_COVID}_prelim": NUM_HOSP_REPORTING_COVID_COL,
f"{NUM_HOSP_REPORTING_FLU}_prelim": NUM_HOSP_REPORTING_FLU_COL,
f"{NUM_HOSP_REPORTING_RSV}_prelim": NUM_HOSP_REPORTING_RSV_COL,
}

PRELIM_TYPE_DICT = {
"timestamp": "datetime64[ns]",
"geo_id": str,
f"{TOTAL_ADMISSION_COVID}_prelim": float,
f"{TOTAL_ADMISSION_FLU}_prelim": float,
f"{TOTAL_ADMISSION_RSV}_prelim": float,
f"{NUM_HOSP_REPORTING_COVID}_prelim": float,
f"{NUM_HOSP_REPORTING_FLU}_prelim": float,
f"{NUM_HOSP_REPORTING_RSV}_prelim": float,
}
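
The signal constants above are consumed in pull.py below, which copies each Socrata column under its signal name and then casts with the type dict. A minimal standalone sketch of that rename-and-cast pattern, using a toy single-row frame (values are illustrative, not real NHSN data):

import pandas as pd

# Toy frame mimicking the Socrata schema; values are illustrative only.
raw = pd.DataFrame({
    "weekendingdate": ["2021-08-21"],
    "jurisdiction": ["US"],
    "totalconfrsvnewadm": ["84"],
})

signals_map = {"confirmed_admissions_rsv_ew": "totalconfrsvnewadm"}
type_dict = {
    "timestamp": "datetime64[ns]",
    "geo_id": str,
    "confirmed_admissions_rsv_ew": float,
}

df = raw.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
for signal, col_name in signals_map.items():
    df[signal] = df[col_name]  # copy the Socrata column under the signal name
df = df[list(type_dict.keys())].astype(type_dict)  # keep signal columns, cast types
df["geo_id"] = df["geo_id"].str.lower()  # "US" -> "us", as pull.py does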
92 changes: 21 additions & 71 deletions nhsn/delphi_nhsn/pull.py
@@ -58,7 +58,10 @@ def check_last_updated(socrata_token, dataset_id, logger):
def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger):
"""Pull data from Socrata API."""
client = Socrata("data.cdc.gov", socrata_token)
logger.info("Pulling data from Socrata API")
logger.info(
f"Pulling {'main' if dataset_id == MAIN_DATASET_ID else 'preliminary'} data from Socrata API",
dataset_id=dataset_id,
)
results = []
offset = 0
limit = 50000 # maximum limit allowed by SODA 2.0
@@ -80,7 +83,8 @@ def pull_data(socrata_token: str, dataset_id: str, backup_dir: str, logger):

if results:
df = pd.DataFrame.from_records(results)
create_backup_csv(df, backup_dir, False, logger=logger)
sensor = "prelim" if dataset_id == PRELIM_DATASET_ID else None
create_backup_csv(df, backup_dir, False, sensor=sensor, logger=logger)
else:
df = pd.DataFrame()
return df
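
This is the hunk behind the "fixing backup overwrite" bullet: the main and preliminary pulls previously wrote backups under the same date-stamped name, so the second pull clobbered the first; tagging the preliminary pull with sensor="prelim" gives it a distinct file (cf. 20241212.csv.gz vs 20241212_prelim.csv.gz in the test data below). A hypothetical sketch of that naming scheme, not the delphi_utils implementation:

from datetime import date
from typing import Optional

def backup_filename(backup_dir: str, sensor: Optional[str] = None) -> str:
    """Hypothetical helper: date-stamped backup name, with an optional sensor suffix."""
    stamp = date.today().strftime("%Y%m%d")
    suffix = f"_{sensor}" if sensor else ""
    return f"{backup_dir}/{stamp}{suffix}.csv.gz"

print(backup_filename("./raw_data_backups"))                   # e.g. ./raw_data_backups/20250214.csv.gz
print(backup_filename("./raw_data_backups", sensor="prelim"))  # e.g. ./raw_data_backups/20250214_prelim.csv.gz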
@@ -120,6 +124,7 @@ def pull_nhsn_data(
backup_dir: str,
custom_run: bool,
issue_date: Optional[str],
preliminary: bool = False,
logger: Optional[logging.Logger] = None,
):
"""Pull the latest NHSN hospital admission data, and conforms it into a dataset.
@@ -137,6 +142,10 @@
Directory to which to save raw backup data
custom_run: bool
Flag indicating if the current run is a patch. If so, don't save any data to disk
preliminary: bool
Flag indicating whether to pull the main or the preliminary dataset
issue_date:
Date indicating which backup file to pull when patching
logger: Optional[logging.Logger]
logger object
@@ -145,22 +154,26 @@
pd.DataFrame
Dataframe as described above.
"""
dataset_id = PRELIM_DATASET_ID if preliminary else MAIN_DATASET_ID
# Pull data from Socrata API
df = (
pull_data(socrata_token, MAIN_DATASET_ID, backup_dir, logger)
pull_data(socrata_token, dataset_id, backup_dir, logger)
if not custom_run
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=False)
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=preliminary)
)

recently_updated = True if custom_run else check_last_updated(socrata_token, MAIN_DATASET_ID, logger)
recently_updated = True if custom_run else check_last_updated(socrata_token, dataset_id, logger)

type_dict = PRELIM_TYPE_DICT if preliminary else TYPE_DICT
keep_columns = list(type_dict.keys())
filtered_type_dict = copy.deepcopy(type_dict)

keep_columns = list(TYPE_DICT.keys())
signal_map = PRELIM_SIGNALS_MAP if preliminary else SIGNALS_MAP

if not df.empty and recently_updated:
df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
filtered_type_dict = copy.deepcopy(TYPE_DICT)

for signal, col_name in SIGNALS_MAP.items():
for signal, col_name in signal_map.items():
# older backups don't have certain columns
try:
df[signal] = df[col_name]
@@ -178,66 +191,3 @@
df = pd.DataFrame(columns=keep_columns)

return df


def pull_preliminary_nhsn_data(
socrata_token: str,
backup_dir: str,
custom_run: bool,
issue_date: Optional[str],
logger: Optional[logging.Logger] = None,
):
"""Pull the latest preliminary NHSN hospital admission data, and conforms it into a dataset.
The output dataset has:
- Each row corresponds to a single observation
- Each row additionally has columns for the signals in SIGNALS
Parameters
----------
socrata_token: str
My App Token for pulling the NHSN data
backup_dir: str
Directory to which to save raw backup data
custom_run: bool
Flag indicating if the current run is a patch. If so, don't save any data to disk
logger: Optional[logging.Logger]
logger object
Returns
-------
pd.DataFrame
Dataframe as described above.
"""
# Pull data from Socrata API
df = (
pull_data(socrata_token, PRELIM_DATASET_ID, backup_dir, logger)
if not custom_run
else pull_data_from_file(backup_dir, issue_date, logger, prelim_flag=True)
)

keep_columns = list(PRELIM_TYPE_DICT.keys())
recently_updated = True if custom_run else check_last_updated(socrata_token, PRELIM_DATASET_ID, logger)

if not df.empty and recently_updated:
df = df.rename(columns={"weekendingdate": "timestamp", "jurisdiction": "geo_id"})
filtered_type_dict = copy.deepcopy(PRELIM_TYPE_DICT)

for signal, col_name in PRELIM_SIGNALS_MAP.items():
try:
df[signal] = df[col_name]
except KeyError:
logger.info("column not available in data", col_name=col_name, signal=signal)
keep_columns.remove(signal)
del filtered_type_dict[signal]

df = df[keep_columns]
df = df.astype(filtered_type_dict)

df["geo_id"] = df["geo_id"].str.lower()
df.loc[df["geo_id"] == "usa", "geo_id"] = "us"
else:
df = pd.DataFrame(columns=keep_columns)

return df
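
With the new preliminary flag, the dedicated pull_preliminary_nhsn_data wrapper above is dropped; run.py below switches to two calls of the consolidated function. A usage sketch (the token and backup path are placeholders, and the logger is normally the delphi_utils structured logger; plain logging is used here as a stand-in):

import logging

from delphi_nhsn.pull import pull_nhsn_data

logger = logging.getLogger(__name__)

# Placeholder values; in run.py these come from the params dict.
socrata_token = "MY_APP_TOKEN"
backup_dir = "./raw_data_backups"

# One function now serves both datasets; the flag selects the dataset id,
# signal map, and type dict internally.
nhsn_df = pull_nhsn_data(
    socrata_token, backup_dir, custom_run=False, issue_date=None, logger=logger
)
preliminary_nhsn_df = pull_nhsn_data(
    socrata_token, backup_dir, custom_run=False, issue_date=None,
    logger=logger, preliminary=True,
)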
7 changes: 4 additions & 3 deletions nhsn/delphi_nhsn/run.py
@@ -24,7 +24,7 @@
from delphi_utils.export import create_export_csv

from .constants import GEOS, PRELIM_SIGNALS_MAP, SIGNALS_MAP
from .pull import pull_nhsn_data, pull_preliminary_nhsn_data
from .pull import pull_nhsn_data


def run_module(params, logger=None):
@@ -56,8 +56,8 @@ def run_module(params, logger=None):
export_start_date = export_start_date.strftime("%Y-%m-%d")

nhsn_df = pull_nhsn_data(socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger)
preliminary_nhsn_df = pull_preliminary_nhsn_data(
socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger
preliminary_nhsn_df = pull_nhsn_data(
socrata_token, backup_dir, custom_run=custom_run, issue_date=issue_date, logger=logger, preliminary=True
)

geo_mapper = GeoMapper()
@@ -92,6 +92,7 @@

df["se"] = np.nan
df["sample_size"] = np.nan

dates = create_export_csv(
df,
geo_res=geo,
Binary file modified nhsn/tests/test_data/20241212.csv.gz
Binary file modified nhsn/tests/test_data/20241212_prelim.csv.gz
21 changes: 21 additions & 0 deletions nhsn/tests/test_data/expected_df.csv
@@ -0,0 +1,21 @@
timestamp,geo_id,confirmed_admissions_covid_ew,confirmed_admissions_flu_ew,confirmed_admissions_rsv_ew,hosprep_confirmed_admissions_covid_ew,hosprep_confirmed_admissions_flu_ew,hosprep_confirmed_admissions_rsv_ew
2021-08-21,md,53.0,2.0,0.0,13.0,13.0,1.0
2021-08-21,co,852.0,0.0,,92.0,78.0,0.0
2021-08-21,us,10384.0,6049.0,84.0,5426.0,5426.0,469.0
2021-08-28,co,835.0,1.0,,92.0,78.0,0.0
2021-08-28,us,94596.0,262.0,,5391.0,4397.0,0.0
2021-09-04,co,1000.0,3.0,,92.0,78.0,0.0
2021-09-04,us,93241.0,282.0,,5392.0,4396.0,0.0
2021-09-11,co,982.0,2.0,,92.0,78.0,0.0
2021-09-11,us,88162.0,247.0,,5391.0,4377.0,0.0
2021-09-18,co,955.0,0.0,,92.0,78.0,0.0
2021-09-18,us,79169.0,261.0,,5394.0,4362.0,0.0
2021-09-25,co,993.0,0.0,,92.0,78.0,0.0
2021-09-25,us,67740.0,234.0,,5393.0,4368.0,0.0
2021-10-02,co,970.0,0.0,,92.0,78.0,0.0
2021-10-02,us,58076.0,253.0,,5395.0,4391.0,0.0
2021-10-09,co,1079.0,1.0,,92.0,78.0,0.0
2021-10-09,us,51744.0,341.0,,5396.0,4379.0,0.0
2021-10-16,co,1231.0,0.0,,92.0,78.0,0.0
2021-10-16,us,45978.0,266.0,,5394.0,4307.0,0.0
2021-10-16,region 1,45978.0,266.0,,5394.0,4307.0,0.0
20 changes: 20 additions & 0 deletions nhsn/tests/test_data/expected_df_prelim.csv
@@ -0,0 +1,20 @@
timestamp,geo_id,confirmed_admissions_covid_ew_prelim,confirmed_admissions_flu_ew_prelim,confirmed_admissions_rsv_ew_prelim,hosprep_confirmed_admissions_covid_ew_prelim,hosprep_confirmed_admissions_flu_ew_prelim,hosprep_confirmed_admissions_rsv_ew_prelim
2021-08-21,mi,269.0,523.0,1.0,152.0,152.0,4.0
2021-08-21,co,852.0,0.0,,92.0,78.0,0.0
2021-08-21,us,8946.0,5576.0,61.0,5422.0,5422.0,485.0
2021-08-28,co,835.0,1.0,,92.0,78.0,0.0
2021-08-28,us,94596.0,262.0,,5391.0,4397.0,0.0
2021-09-04,co,1000.0,3.0,,92.0,78.0,0.0
2021-09-04,us,93241.0,282.0,,5392.0,4396.0,0.0
2021-09-11,co,982.0,2.0,,92.0,78.0,0.0
2021-09-11,us,88162.0,247.0,,5391.0,4377.0,0.0
2021-09-18,co,955.0,0.0,,92.0,78.0,0.0
2021-09-18,us,79169.0,261.0,,5394.0,4362.0,0.0
2021-09-25,co,993.0,0.0,,92.0,78.0,0.0
2021-09-25,us,67740.0,234.0,,5393.0,4368.0,0.0
2021-10-02,co,970.0,0.0,,92.0,78.0,0.0
2021-10-02,us,58076.0,253.0,,5395.0,4391.0,0.0
2021-10-09,co,1079.0,1.0,,92.0,78.0,0.0
2021-10-09,us,51744.0,341.0,,5396.0,4379.0,0.0
2021-10-16,co,1231.0,0.0,,92.0,78.0,0.0
2021-10-16,us,45978.0,266.0,,5394.0,4307.0,0.0

