working on shared

owid · May 24, 2024 · cd27482 · cd27482
1 parent b698b68
commit cd27482
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 10 deletions.
diff --git a/dag/health.yml b/dag/health.yml
@@ -615,6 +615,7 @@ steps:
     - snapshot-private://ihme_gbd/2024-05-20/gbd_cause.csv
   data-private://garden/ihme_gbd/2024-05-20/gbd_cause:
     - data-private://meadow/ihme_gbd/2024-05-20/gbd_cause
+    - data://garden/regions/2023-01-01/regions
   data-private://grapher/ihme_gbd/2024-05-20/gbd_cause:
     - data-private://garden/ihme_gbd/2024-05-20/gbd_cause
 
@@ -623,6 +624,7 @@ steps:
     - snapshot-private://ihme_gbd/2024-05-20/gbd_prevalence.csv
   data-private://garden/ihme_gbd/2024-05-20/gbd_prevalence:
     - data-private://meadow/ihme_gbd/2024-05-20/gbd_prevalence
+    - data://garden/regions/2023-01-01/regions
   data-private://grapher/ihme_gbd/2024-05-20/gbd_prevalence:
     - data-private://garden/ihme_gbd/2024-05-20/gbd_prevalence
 
@@ -631,5 +633,6 @@ steps:
     - snapshot-private://ihme_gbd/2024-05-20/gbd_mental_health.csv
   data-private://garden/ihme_gbd/2024-05-20/gbd_mental_health:
     - data-private://meadow/ihme_gbd/2024-05-20/gbd_mental_health
+    - data://garden/regions/2023-01-01/regions
   data-private://grapher/ihme_gbd/2024-05-20/gbd_mental_health:
     - data-private://garden/ihme_gbd/2024-05-20/gbd_mental_health
diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_cause.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/gbd_cause.py
@@ -1,10 +1,21 @@
 """Load a meadow dataset and create a garden dataset."""
 
+from shared import add_regional_aggregates
+
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
+REGIONS = ["North America", "South America", "Europe", "Africa", "Asia", "Oceania"]
+AGE_GROUPS_RANGES = {
+    "All ages": [0, None],
+    "<5 years": [0, 4],
+    "5-14 years": [5, 14],
+    "15-49 years": [15, 49],
+    "50-69 years": [50, 69],
+    "70+ years": [70, None],
+}
 
 
 def run(dest_dir: str) -> None:
@@ -16,11 +27,20 @@ def run(dest_dir: str) -> None:
 
     # Read table from meadow dataset.
     tb = ds_meadow["gbd_cause"].reset_index()
-
+    ds_regions = paths.load_dataset("regions")
     #
     # Process data.
     #
     tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+    # Add regional aggregates
+    tb = add_regional_aggregates(
+        tb=tb,
+        ds_regions=ds_regions,
+        index_cols=["country", "year", "metric", "measure", "cause", "age"],
+        regions=REGIONS,
+        age_group_mapping=AGE_GROUPS_RANGES,
+    )
+
     # Split into two tables: one for deaths, one for DALYs
     tb_deaths = tb[tb["measure"] == "Deaths"].copy()
     tb_dalys = tb[tb["measure"] == "DALYs (Disability-Adjusted Life Years)"].copy()

diff --git a/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py b/etl/steps/data/garden/ihme_gbd/2024-05-20/shared.py
@@ -13,24 +13,31 @@ def add_regional_aggregates(
     """
     Adding the regional aggregated data for the OWID continent regions
     """
-    # Add population data
-    tb = add_population(
-        df=tb, country_col="country", year_col="year", age_col="age", age_group_mapping=age_group_mapping
+    tb_number = tb[tb["metric"].isin(["Number", "Percent"])].copy()
+    tb_number = add_population(
+        df=tb_number, country_col="country", year_col="year", age_col="age", age_group_mapping=age_group_mapping
     )
-    tb_number = tb[tb["metric"] == "Number"].copy()
     tb_rate = tb[tb["metric"] == "Rate"].copy()
-    # Add region aggregates.
-    tb_number = geo.add_regions_to_table(
-        tb_number,
+    tb_percent = tb[tb["metric"] == "Percent"].copy()
+    tb_number_percent = pr.concat([tb_number, tb_percent], ignore_index=True)
+    # Add region aggregates - for Number and Percent
+    tb_number_percent = geo.add_regions_to_table(
+        tb_number_percent,
         index_columns=index_cols,
         regions=regions,
         ds_regions=ds_regions,
         min_num_values_per_year=1,
     )
+    # Add region aggregates - for Rate: derived as value / population * 100,000 (TODO: confirm only population is needed here)
+
     tb_rate_regions = tb_number[tb_number["country"].isin(regions)].copy()
     tb_rate_regions["value"] = tb_number["value"] / tb_number["population"] * 100_000
     tb_rate_regions["metric"] = "Rate"
+    tb_rate = pr.concat([tb_rate, tb_rate_regions], ignore_index=True)
+    tb_rate = tb_rate.drop(columns="population")
 
-    tb_out = pr.concat([tb_number, tb_rate, tb_rate_regions], ignore_index=True)
-    tb_out = tb_out.drop(columns=["population"])
+    tb_out = pr.concat(
+        [tb_number_percent, tb_rate],
+        ignore_index=True,
+    )
     return tb_out