From cb704904c2b17c0c10f4cfa84d5d9ff7817ea58e Mon Sep 17 00:00:00 2001 From: spoonerf Date: Tue, 28 May 2024 10:46:41 +0100 Subject: [PATCH] adding age-groups to shared --- .../ihme_gbd/2024-05-20/gbd_mental_health.py | 34 +++++++++++++++++- .../ihme_gbd/2024-05-20/gbd_prevalence.py | 17 +-------- .../data/garden/ihme_gbd/2024-05-20/shared.py | 35 ++++++++++++++++--- 3 files changed, 65 insertions(+), 21 deletions(-) diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_mental_health.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_mental_health.py index 77dd4d476eb..0c5abb39720 100644 --- a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_mental_health.py +++ b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_mental_health.py @@ -1,10 +1,32 @@ """Load a meadow dataset and create a garden dataset.""" +from shared import add_regional_aggregates, add_share_population + from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. paths = PathFinder(__file__) +REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania"] +AGE_GROUPS_RANGES = { + "All ages": [0, None], + "<5 years": [0, 4], + "5-14 years": [5, 14], + "15-19 years": [15, 19], + "15-49 years": [15, 49], + "20-24 years": [20, 24], + "25-29 years": [25, 29], + "30-34 years": [30, 34], + "35-39 years": [35, 39], + "40-44 years": [40, 44], + "45-49 years": [45, 49], + "50-54 years": [50, 54], + "50-69 years": [50, 69], + "55-59 years": [55, 59], + "60-64 years": [60, 64], + "65-69 years": [65, 69], + "70+ years": [70, None], +} def run(dest_dir: str) -> None: @@ -13,10 +35,20 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("gbd_mental_health") - + # Load regions dataset. + ds_regions = paths.load_dataset("regions") # Read table from meadow dataset. 
tb = ds_meadow["gbd_mental_health"].reset_index() + tb = add_regional_aggregates( + tb, + ds_regions, + index_cols=["country", "year", "metric", "cause", "age", "sex"], + regions=REGIONS, + age_group_mapping=AGE_GROUPS_RANGES, + ) + # Add a share of the population column + tb = add_share_population(tb) # # Process data. # diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_prevalence.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_prevalence.py index f2de6a025a3..54e11893ce4 100644 --- a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_prevalence.py +++ b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_prevalence.py @@ -1,8 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" -from owid.catalog import Table -from owid.catalog import processing as pr -from shared import add_regional_aggregates +from shared import add_regional_aggregates, add_share_population from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -72,16 +70,3 @@ def run(dest_dir: str) -> None: # Save changes in the new garden dataset. ds_garden.save() - - -def add_share_population(tb: Table) -> Table: - """ - Add a share of the population column to the table. - The 'Rate' column is the number of cases per 100,000 people, we want the equivalent per 100 people. 
- """ - tb_share = tb[tb["metric"] == "Rate"].copy() - tb_share["metric"] = "Share" - tb_share["value"] = tb_share["value"] / 1000 - - tb = pr.concat([tb, tb_share], ignore_index=True) - return tb diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py index a611f46323e..be4aa028b05 100644 --- a/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py +++ b/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py @@ -22,10 +22,24 @@ def add_regional_aggregates( tb_number = tb[tb["metric"].isin(["Number", "Percent"])].copy() tb_rate = tb[tb["metric"] == "Rate"].copy() tb_percent = tb[tb["metric"] == "Percent"].copy() - # Add population data - tb_number = add_population( - df=tb_number, country_col="country", year_col="year", age_col="age", age_group_mapping=age_group_mapping - ) + # Add population data - some datasets will have data disaggregated by sex + if "sex" in tb.columns: + tb_number = add_population( + df=tb_number, + country_col="country", + year_col="year", + age_col="age", + age_group_mapping=age_group_mapping, + sex_col="sex", + sex_group_all="Both", + sex_group_female="Female", + sex_group_male="Male", + ) + else: + tb_number = add_population( + df=tb_number, country_col="country", year_col="year", age_col="age", age_group_mapping=age_group_mapping + ) + assert tb_number["value"].notna().all(), "Values are missing in the Number table, check configuration" # Combine Number and Percent tables tb_number_percent = pr.concat([tb_number, tb_percent], ignore_index=True) # Add region aggregates - for Number and Percent (if present) @@ -51,3 +65,16 @@ def add_regional_aggregates( ) tb_out = tb_out.drop(columns="population") return tb_out + + +def add_share_population(tb: Table) -> Table: + """ + Add a share of the population column to the table. + Rows with metric 'Rate' hold the number of cases per 100,000 people; we want the equivalent per 100 people. 
+ """ + tb_share = tb[tb["metric"] == "Rate"].copy() + tb_share["metric"] = "Share" + tb_share["value"] = tb_share["value"] / 1000 + + tb = pr.concat([tb, tb_share], ignore_index=True) + return tb