add progression ratio

owid · Nov 25, 2024 · 7e1975b · 7e1975b
1 parent 087d82b
commit 7e1975b
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 12 deletions.
diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml b/etl/steps/data/garden/hmd/2024-11-19/hfd.meta.yml
@@ -13,6 +13,16 @@ definitions:
       <%- elif birth_order == '5p' %>
        fifth (or greater)
       <% endif %>
+    bo_1_m1: |-
+      <% if birth_order == '2' %>
+       second
+      <%- elif birth_order == '3' %>
+       third
+      <%- elif birth_order == '4' %>
+       fourth
+      <%- elif birth_order == '5p' %>
+       fifth (or greater)
+      <% endif %>
     title: |-
       <% if birth_order == 'total' %>
       << title >> - Total
@@ -284,3 +294,21 @@ tables:
         description_key:
           - Reflects variability in the timing of births up to age 40 within a cohort.
           - Helps to understand how concentrated or spread out early childbearing is within the cohort.
+
+      ppr:
+        title: |-
+          Cohort parity progression ratios - << int(birt_order) >> to << int(birth_order) + 1 >> birth
+        description_short: |-
+          <% if birth_order == '1' %>
+          Probability of giving birth to a first child.
+          <%- elif birth_order == '2' %>
+          Probability of giving birth to a second child, conditioned on having had a first child.
+          <%- elif birth_order == '3' %>
+          Probability of giving birth to a third child, conditioned on having had a second child.
+          <%- elif birth_order == '4' %>
+          Probability of giving birth to a fourth child, conditioned on having had a third child.
+          <% endif %>
+        unit: ""
+        description_key:
+          - Measures the likelihood that a woman with a given number of children will go on to have another child.
+          - It is useful for understanding family-building dynamics and changes in reproductive behavior over time.
diff --git a/etl/steps/data/garden/hmd/2024-11-19/hfd.py b/etl/steps/data/garden/hmd/2024-11-19/hfd.py
@@ -410,17 +410,24 @@
     },
     # "pmabc",
 }
+REGEX_PERIOD_BO = {}
+
 # Tables to process for COHORT country-cohort
 TABLES_COHORT = [
     "mabvh",
-    # "pprvhbo",
+    "pprvhbo",
     "sdmabvh",
     "tfrvh",
 ]
 TABLES_COHORT_W_PARITY = {
-    # "pprvhbo": {
-    #     "indicators": ["patfr"],
-    # },
+    "pprvhbo": {
+        "indicators": ["ppr"],
+    },
+}
+REGEX_COHORT_BO = {
+    "pprvhbo": {
+        "ppr": r"^ppr\d+_\d+$",
+    },
 }
 
 
@@ -453,16 +460,19 @@ def integrate_bo(tb, tb_bo, col_index, core_indicators):
     return tb
 
 
-def make_table_with_birth_order(tb, col_index, core_indicators, col_bo="birth_order"):
+def make_table_with_birth_order(tb, col_index, core_indicators, col_bo="birth_order", regex_bo=None):
     """Change the format of a table from wide to long, to incorporate the birth order as a dimension."""
 
+    if regex_bo is None:
+        regex_bo = {}
+
     def _generate_regex(name):
         if re.search(r"\d$", name):  # Check if the name ends with a number
             return rf"^{name}_?(\d+|(\d+p)?)$"
         else:
             return rf"^{name}(\d+|(\d+p)?)$"
 
-    regex_patterns = {name: _generate_regex(name) for name in core_indicators}
+    regex_patterns = {name: regex_bo.get(name, _generate_regex(name)) for name in core_indicators}
 
     tb = tb.melt(
         col_index,
@@ -529,16 +539,22 @@ def read_table(ds_meadow, tname, tname_base=None):
     return tb
 
 
-def make_table_list(ds_meadow, table_names, tables_w_parity, cols_index, col_bo):
+def make_table_list(ds_meadow, table_names, tables_w_parity, cols_index, col_bo, regex_bo=None):
     """Reads relevant tables, and formats them accordingly.
 
     Tables come in wide format, sometimes as two-tables (main and birth order). This function consolidates them into single tables per topic.
 
     For instance, we have one table with total fertility rates (columns `tfr`). And then another one with fertilities broken down by birth order (columns `tfr`, `tfr1`, etc.)
     Instead, we want a table in long format, which has one column `tfr` and adds the birth order as a dimension of the table.
     """
+    if regex_bo is None:
+        regex_bo = {}
+
     tbs = []
     for tname in table_names:
+        # Get custom regex for this table
+        regex = regex_bo.get(tname)
+
         # Read main table
         tb = read_table(ds_meadow, tname)
 
@@ -552,11 +568,11 @@ def make_table_list(ds_meadow, table_names, tables_w_parity, cols_index, col_bo)
             # Add BO to main table
             tb = integrate_bo(tb, tb_bo, cols_index, core_indicators)
             # Consolidate table: Use long format, and add birth_order as a dimension of the main table.
-            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo)
+            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo, regex)
         # Sometimes, the main table contains already indicators broken down by birth order. In such cases, we also need to reshape the table.
         elif tname in tables_w_parity:
-            core_indicators = cols_index + tables_w_parity[tname]["indicators"]
-            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo)
+            core_indicators = tables_w_parity[tname]["indicators"]
+            tb = make_table_with_birth_order(tb, cols_index, core_indicators, col_bo, regex)
 
         # Add formatted table to the list of tables.
         tbs.append(tb)
@@ -577,7 +593,14 @@ def run(dest_dir: str) -> None:
     col_bo = "birth_order"
     cols_index_out = cols_index + [col_bo]
     ## Read tables
-    tbs = make_table_list(ds_meadow, TABLES_PERIOD, TABLES_PERIOD_W_PARITY, cols_index, col_bo)
+    tbs = make_table_list(
+        ds_meadow=ds_meadow,
+        table_names=TABLES_PERIOD,
+        tables_w_parity=TABLES_PERIOD_W_PARITY,
+        cols_index=cols_index,
+        col_bo=col_bo,
+        regex_bo=REGEX_PERIOD_BO,
+    )
     ## Sanity check: no column is named the same
     colnames = [col for t in tbs for col in t.columns if col not in cols_index_out]
     assert len(colnames) == len(set(colnames)), "Some columns are named the same!"
@@ -591,10 +614,21 @@ def run(dest_dir: str) -> None:
     col_bo = "birth_order"
     cols_index_out = cols_index + [col_bo]
     ## Read tables
-    tbs = make_table_list(ds_meadow, TABLES_COHORT, TABLES_COHORT_W_PARITY, cols_index, col_bo)
+    tbs = make_table_list(
+        ds_meadow=ds_meadow,
+        table_names=TABLES_COHORT,
+        tables_w_parity=TABLES_COHORT_W_PARITY,
+        cols_index=cols_index,
+        col_bo=col_bo,
+        regex_bo=REGEX_COHORT_BO,
+    )
     ## Sanity check: no column is named the same
     colnames = [col for t in tbs for col in t.columns if col not in cols_index_out]
     assert len(colnames) == len(set(colnames)), "Some columns are named the same!"
+    # Quick fix: change birth_order label for PPR
+    for tb in tbs:
+        if tb.m.short_name == "pprvhbo":
+            tb["birth_order"] = tb["birth_order"].str.split("_").str[-1]
     ## Merge
     tb_cohort = pr.multi_merge(tbs, on=cols_index_out, how="outer")
     tb_cohort = tb_cohort.format(cols_index_out, short_name="cohort")