adding age-groups to shared

owid · May 28, 2024 · cb70490 · cb70490
1 parent e50da5d
commit cb70490
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 21 deletions.
diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_mental_health.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_mental_health.py
@@ -1,10 +1,32 @@
 """Load a meadow dataset and create a garden dataset."""
 
+from shared import add_regional_aggregates, add_share_population
+
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
+REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania"]  # passed as `regions` in run()
+AGE_GROUPS_RANGES = {  # passed as `age_group_mapping` in run(); presumably label -> [min age, max age] - TODO confirm
+    "All ages": [0, None],  # NOTE(review): None presumably means "no upper bound" - confirm against add_population
+    "<5 years": [0, 4],
+    "5-14 years": [5, 14],
+    "15-19 years": [15, 19],
+    "15-49 years": [15, 49],  # broad band: overlaps the five-year groups between 15 and 49
+    "20-24 years": [20, 24],
+    "25-29 years": [25, 29],
+    "30-34 years": [30, 34],
+    "35-39 years": [35, 39],
+    "40-44 years": [40, 44],
+    "45-49 years": [45, 49],
+    "50-54 years": [50, 54],
+    "50-69 years": [50, 69],  # broad band: overlaps the five-year groups between 50 and 69
+    "55-59 years": [55, 59],
+    "60-64 years": [60, 64],
+    "65-69 years": [65, 69],
+    "70+ years": [70, None],
+}
 
 
 def run(dest_dir: str) -> None:
@@ -13,10 +35,20 @@ def run(dest_dir: str) -> None:
     #
     # Load meadow dataset.
     ds_meadow = paths.load_dataset("gbd_mental_health")
-
+    # Load regions dataset.
+    ds_regions = paths.load_dataset("regions")
     # Read table from meadow dataset.
     tb = ds_meadow["gbd_mental_health"].reset_index()
 
+    tb = add_regional_aggregates(
+        tb,
+        ds_regions,
+        index_cols=["country", "year", "metric", "cause", "age", "sex"],
+        regions=REGIONS,
+        age_group_mapping=AGE_GROUPS_RANGES,
+    )
+    # Add 'Share' rows derived from the 'Rate' rows (see add_share_population in shared.py)
+    tb = add_share_population(tb)
     #
     # Process data.
     #

diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_prevalence.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_prevalence.py
@@ -1,8 +1,6 @@
 """Load a meadow dataset and create a garden dataset."""
 
-from owid.catalog import Table
-from owid.catalog import processing as pr
-from shared import add_regional_aggregates
+from shared import add_regional_aggregates, add_share_population
 
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
@@ -72,16 +70,3 @@ def run(dest_dir: str) -> None:
 
     # Save changes in the new garden dataset.
     ds_garden.save()
-
-
-def add_share_population(tb: Table) -> Table:
-    """
-    Add a share of the population column to the table.
-    The 'Rate' column is the number of cases per 100,000 people, we want the equivalent per 100 people.
-    """
-    tb_share = tb[tb["metric"] == "Rate"].copy()
-    tb_share["metric"] = "Share"
-    tb_share["value"] = tb_share["value"] / 1000
-
-    tb = pr.concat([tb, tb_share], ignore_index=True)
-    return tb
diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py
@@ -22,10 +22,24 @@ def add_regional_aggregates(
     tb_number = tb[tb["metric"].isin(["Number", "Percent"])].copy()
     tb_rate = tb[tb["metric"] == "Rate"].copy()
     tb_percent = tb[tb["metric"] == "Percent"].copy()
-    # Add population data
-    tb_number = add_population(
-        df=tb_number, country_col="country", year_col="year", age_col="age", age_group_mapping=age_group_mapping
-    )
+    # Add population data - some datasets will have data disaggregated by sex
+    if "sex" in tb.columns:
+        tb_number = add_population(
+            df=tb_number,
+            country_col="country",
+            year_col="year",
+            age_col="age",
+            age_group_mapping=age_group_mapping,
+            sex_col="sex",
+            sex_group_all="Both",
+            sex_group_female="Female",
+            sex_group_male="Male",
+        )
+    else:
+        tb_number = add_population(
+            df=tb_number, country_col="country", year_col="year", age_col="age", age_group_mapping=age_group_mapping
+        )
+    assert tb_number["value"].notna().all(), "Values are missing in the Number table, check configuration"
     # Combine Number and Percent tables
     tb_number_percent = pr.concat([tb_number, tb_percent], ignore_index=True)
     # Add region aggregates - for Number and Percent (if present)
@@ -51,3 +65,16 @@ def add_regional_aggregates(
     )
     tb_out = tb_out.drop(columns="population")
     return tb_out
+
+
+def add_share_population(tb: Table) -> Table:
+    """
+    Append 'Share' rows to the table: a copy of every row whose metric is 'Rate', with its value divided by 1000.
+    'Rate' is stated to be cases per 100,000 people, so 'Share' is the equivalent per 100 people (a percentage).
+    """
+    tb_share = tb[tb["metric"] == "Rate"].copy()
+    tb_share["metric"] = "Share"
+    tb_share["value"] = tb_share["value"] / 1000  # 100,000 / 1000 = 100: rescales a per-100,000 rate to per-100
+
+    tb = pr.concat([tb, tb_share], ignore_index=True)  # original rows are kept; the 'Share' rows are appended
+    return tb