From f25bf83417a993e3b2d2817e3488595f8331fc8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lucas=20Rod=C3=A9s-Guirao?=
 <lucasrodes@users.noreply.github.com>
Date: Wed, 4 Dec 2024 16:01:27 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20birth=20rate=20in=20HMD=20(#3690?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 📊 closer look at population estimates by HMD

* wip

* fix birth rate estimation

* drop column
---
 etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml  | 14 +++++++++++---
 etl/steps/data/garden/hmd/2024-12-01/hmd.py        | 13 ++++++++++++-
 .../data/garden/hmd/2024-12-03/hmd_country.py      |  3 ++-
 etl/steps/data/grapher/hmd/2024-12-01/hmd.py       |  1 +
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml
index 4aab98dfd83..d846e9b485c 100644
--- a/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml
+++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml
@@ -10,7 +10,7 @@ definitions:
     display_name_dim: |-
       at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'total') >>, << type >>
     title_public_dim: |-
-      at << age if age != '0' else 'birth'>>
+      <% if age != 'total' %>at << age if age != '0' else 'birth'>><% endif %>
   global:
     life_expectancy:
       point_1: |-
@@ -251,28 +251,36 @@ tables:
       presentation:
         topic_tags:
           - Population Growth
-        title_variant: << sex + 's, ' if sex != 'total' >>
 
     variables:
       population:
         title: Population
         unit: people
         description_short: |-
+          <% if age == 'total' %>
+          <%- if sex == 'total' %>
+          The total number of people living in a country.
+          <%- else %>
+          The total number of << sex + 's' >> living in a country.
+          <%- endif %>
+          <%- else %>
           <% if sex == 'total' %>
           The total number of people aged << age >> living in a country.
           <%- else %>
           The total number of << sex + 's' >> aged << age >> living in a country.
           <%- endif %>
+          <%- endif %>
         description_processing: |-
           From HMD Notes: For populations with territorial changes, two sets of population estimates are given for years in which a territorial change occurred. The first set of estimates (identified as year "19xx-") refers to the population just before the territorial change, whereas the second set (identified as year "19xx+") refers to the population just after the change. For example, in France, the data for "1914-" cover the previous territory (i.e., as of December 31, 1913), whereas the data for "1914+" reflect the territorial boundaries as of January 1, 1914.
 
           We have used the "19xx+" population estimates for the year of the territorial change.
         display:
           name: |-
-            {tables.population.variables.population.title} aged << age >><< ', ' + sex + 's' if (sex != 'total') >>
+            {tables.population.variables.population.title}<< ' aged ' + age if (age != 'total') >><< ', ' + sex + 's' if (sex != 'total') >>
         presentation:
           title_public: |-
             {tables.population.variables.population.title} {definitions.others.title_public_dim}
+          title_variant: << sex + 's, ' if sex != 'total' >>
 
   births:
     common:
diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.py b/etl/steps/data/garden/hmd/2024-12-01/hmd.py
index 4225c9dc193..471502c0299 100644
--- a/etl/steps/data/garden/hmd/2024-12-01/hmd.py
+++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 from owid.catalog import Table
+from owid.catalog import processing as pr
 
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
@@ -84,6 +85,7 @@ def _sanity_check_lt(tb):
         tb=tb_pop,
         col_index=["country", "year", "sex", "age"],
     )
+    tb_pop = add_total_population(tb_pop)
 
     # 5/ Births
     tb_births = process_table(
@@ -92,7 +94,7 @@ def _sanity_check_lt(tb):
     )
 
     def add_birth_rate(tb_pop, tb_births):
-        tb_pop_agg = tb_pop.groupby(["country", "year", "sex"], as_index=False)["population"].sum()
+        tb_pop_agg = tb_pop[tb_pop["age"] == "total"].drop(columns="age")
         tb_births = tb_births.merge(tb_pop_agg, on=["country", "year", "sex"], how="left")
         tb_births["birth_rate"] = tb_births["births"] / tb_births["population"] * 1_000
         tb_births["birth_rate"] = tb_births["birth_rate"].replace([np.inf, -np.inf], np.nan)
@@ -188,6 +190,15 @@ def standardize_sex_cat_names(tb, sex_expected):
     return tb
 
 
+def add_total_population(tb_pop):
+    flag = tb_pop["age"].str.match(r"^(\d{1,3}|\d{3}\+)$")
+    tb_pop_total = tb_pop[flag]
+    tb_pop_total = tb_pop_total.groupby(["country", "year", "sex"], as_index=False)["population"].sum()
+    tb_pop_total["age"] = "total"
+    tb_pop = pr.concat([tb_pop, tb_pop_total], ignore_index=True)
+    return tb_pop
+
+
 def make_table_diffs_ratios(tb: Table) -> Table:
     """Create table with metric differences and ratios.
 
diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py
index c7964f636d9..b649a9b0210 100644
--- a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py
+++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py
@@ -132,7 +132,8 @@ def _prepare_population_table(tb):
 
     Original table is given in years, but we need it in days! We use linear interpolation for that.
     """
-    tb_aux = tb.loc[(tb["sex"] == "total") & ~(tb["age"].str.contains("-")), ["country", "year", "population"]]
+    flag = tb["age"].str.match(r"^(\d{1,3}|\d{3}\+)$")
+    tb_aux = tb.loc[(tb["sex"] == "total") & flag, ["country", "year", "population"]]
     tb_aux = tb_aux.groupby(["country", "year"], as_index=False)["population"].sum()
     ## Assign a day to population. TODO: Check if this is true
     tb_aux["date"] = pd.to_datetime(tb_aux["year"].astype(str) + "-01-01")
diff --git a/etl/steps/data/grapher/hmd/2024-12-01/hmd.py b/etl/steps/data/grapher/hmd/2024-12-01/hmd.py
index 88cb5a79bd8..028b623f60c 100644
--- a/etl/steps/data/grapher/hmd/2024-12-01/hmd.py
+++ b/etl/steps/data/grapher/hmd/2024-12-01/hmd.py
@@ -70,6 +70,7 @@ def keep_only_relevant_dimensions(tb):
         45,
         65,
         80,
+        "total",
     ]
     AGES_SINGLE = list(map(str, AGES_SINGLE)) + ["110+"]
     flag_1 = tb["age"].isin(AGES_SINGLE)