From f25bf83417a993e3b2d2817e3488595f8331fc8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucas=20Rod=C3=A9s-Guirao?= Date: Wed, 4 Dec 2024 16:01:27 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20birth=20rate=20in=20HMD=20(#3690?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 📊 closer look at population estimates by HMD * wip * fix birth rate estimation * drop column --- etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml | 14 +++++++++++--- etl/steps/data/garden/hmd/2024-12-01/hmd.py | 13 ++++++++++++- .../data/garden/hmd/2024-12-03/hmd_country.py | 3 ++- etl/steps/data/grapher/hmd/2024-12-01/hmd.py | 1 + 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml b/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml index 4aab98dfd83..d846e9b485c 100644 --- a/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml +++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.meta.yml @@ -10,7 +10,7 @@ definitions: display_name_dim: |- at << 'birth' if (age == '0') else age >><< ', ' + sex + 's' if (sex != 'total') >>, << type >> title_public_dim: |- - at << age if age != '0' else 'birth'>> + <% if age != 'total' %>at << age if age != '0' else 'birth'>><% endif %> global: life_expectancy: point_1: |- @@ -251,28 +251,36 @@ tables: presentation: topic_tags: - Population Growth - title_variant: << sex + 's, ' if sex != 'total' >> variables: population: title: Population unit: people description_short: |- + <% if age == 'total' %> + <%- if sex == 'total' %> + The total number of people living in a country. + <%- else %> + The total number of << sex + 's' >> living in a country. + <%- endif %> + <%- else %> <% if sex == 'total' %> The total number of people aged << age >> living in a country. <%- else %> The total number of << sex + 's' >> aged << age >> living in a country. 
<%- endif %> + <%- endif %> description_processing: |- From HMD Notes: For populations with territorial changes, two sets of population estimates are given for years in which a territorial change occurred. The first set of estimates (identified as year "19xx-") refers to the population just before the territorial change, whereas the second set (identified as year "19xx+") refers to the population just after the change. For example, in France, the data for "1914-" cover the previous territory (i.e., as of December 31, 1913), whereas the data for "1914+" reflect the territorial boundaries as of January 1, 1914. We have used the "19xx+" population estimates for the year of the territorial change. display: name: |- - {tables.population.variables.population.title} aged << age >><< ', ' + sex + 's' if (sex != 'total') >> + {tables.population.variables.population.title}<< ' aged ' + age if (age != 'total') >><< ', ' + sex + 's' if (sex != 'total') >> presentation: title_public: |- {tables.population.variables.population.title} {definitions.others.title_public_dim} + title_variant: << sex + 's, ' if sex != 'total' >> births: common: diff --git a/etl/steps/data/garden/hmd/2024-12-01/hmd.py b/etl/steps/data/garden/hmd/2024-12-01/hmd.py index 4225c9dc193..471502c0299 100644 --- a/etl/steps/data/garden/hmd/2024-12-01/hmd.py +++ b/etl/steps/data/garden/hmd/2024-12-01/hmd.py @@ -2,6 +2,7 @@ import numpy as np from owid.catalog import Table +from owid.catalog import processing as pr from etl.data_helpers import geo from etl.helpers import PathFinder, create_dataset @@ -84,6 +85,7 @@ def _sanity_check_lt(tb): tb=tb_pop, col_index=["country", "year", "sex", "age"], ) + tb_pop = add_total_population(tb_pop) # 5/ Births tb_births = process_table( @@ -92,7 +94,7 @@ def _sanity_check_lt(tb): ) def add_birth_rate(tb_pop, tb_births): - tb_pop_agg = tb_pop.groupby(["country", "year", "sex"], as_index=False)["population"].sum() + tb_pop_agg = tb_pop[tb_pop["age"] == 
"total"].drop(columns="age") tb_births = tb_births.merge(tb_pop_agg, on=["country", "year", "sex"], how="left") tb_births["birth_rate"] = tb_births["births"] / tb_births["population"] * 1_000 tb_births["birth_rate"] = tb_births["birth_rate"].replace([np.inf, -np.inf], np.nan) @@ -188,6 +190,15 @@ def standardize_sex_cat_names(tb, sex_expected): return tb +def add_total_population(tb_pop): + flag = tb_pop["age"].str.match(r"^(\d{1,3}|\d{3}\+)$") + tb_pop_total = tb_pop[flag] + tb_pop_total = tb_pop_total.groupby(["country", "year", "sex"], as_index=False)["population"].sum() + tb_pop_total["age"] = "total" + tb_pop = pr.concat([tb_pop, tb_pop_total], ignore_index=True) + return tb_pop + + def make_table_diffs_ratios(tb: Table) -> Table: """Create table with metric differences and ratios. diff --git a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py index c7964f636d9..b649a9b0210 100644 --- a/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py +++ b/etl/steps/data/garden/hmd/2024-12-03/hmd_country.py @@ -132,7 +132,8 @@ def _prepare_population_table(tb): Original table is given in years, but we need it in days! We use linear interpolation for that. """ - tb_aux = tb.loc[(tb["sex"] == "total") & ~(tb["age"].str.contains("-")), ["country", "year", "population"]] + flag = tb["age"].str.match(r"^(\d{1,3}|\d{3}\+)$") + tb_aux = tb.loc[(tb["sex"] == "total") & flag, ["country", "year", "population"]] tb_aux = tb_aux.groupby(["country", "year"], as_index=False)["population"].sum() ## Assign a day to population. 
TODO: Check if this is true tb_aux["date"] = pd.to_datetime(tb_aux["year"].astype(str) + "-01-01") diff --git a/etl/steps/data/grapher/hmd/2024-12-01/hmd.py b/etl/steps/data/grapher/hmd/2024-12-01/hmd.py index 88cb5a79bd8..028b623f60c 100644 --- a/etl/steps/data/grapher/hmd/2024-12-01/hmd.py +++ b/etl/steps/data/grapher/hmd/2024-12-01/hmd.py @@ -70,6 +70,7 @@ def keep_only_relevant_dimensions(tb): 45, 65, 80, + "total", ] AGES_SINGLE = list(map(str, AGES_SINGLE)) + ["110+"] flag_1 = tb["age"].isin(AGES_SINGLE)