📊 hmd update (#3642)

* 📊 hmd update * snapshot * wip * dag * archive old hmd * wip * wip * ci/cd * wip * wip * wip * wip * wip * change column names * wip * wip * wip * propagate snapshot metadata * wip * wip * garden * missing dimensions * improve debug message * wip * wip * fix dimension * wip * wip * wip * nit memory opt * wip * fix origins propagation * grapher * dataset title * minor metadata update
owid · Dec 2, 2024 · bdf71c9 · bdf71c9
1 parent e0b76f9
commit bdf71c9
Show file tree

Hide file tree

Showing 13 changed files with 1,416 additions and 12 deletions.
diff --git a/dag/archive/demography.yml b/dag/archive/demography.yml
@@ -52,9 +52,21 @@ steps:
   data-private://grapher/un/2024-07-11/un_wpp_full:
     - data-private://garden/un/2024-07-11/un_wpp
 
-    # Population density
+  # Population density
   data://garden/demography/2023-06-12/population_density:
     - data://garden/demography/2023-03-31/population
     - data://garden/faostat/2024-03-14/faostat_rl
   data://grapher/demography/2023-06-12/population_density:
     - data://garden/demography/2023-06-12/population_density
+
+  # HMD
+  data://meadow/hmd/2022-12-07/life_tables:
+    - snapshot://hmd/2022-12-07/hmd.zip
+  data://garden/hmd/2022-12-07/life_tables:
+    - data://meadow/hmd/2022-12-07/life_tables
+
+  # Survivorship ages (HMD-derived)
+  data://garden/demography/2023-09-27/survivor_percentiles:
+    - data://garden/hmd/2023-09-19/hmd
+  data://grapher/demography/2023-09-27/survivor_percentiles:
+    - data://garden/demography/2023-09-27/survivor_percentiles
diff --git a/dag/demography.yml b/dag/demography.yml
@@ -123,18 +123,20 @@ steps:
   data://grapher/hmd/2023-09-19/hmd:
     - data://garden/hmd/2023-09-19/hmd
 
+  # Human Mortality Database
+  data://meadow/hmd/2024-11-27/hmd:
+    - snapshot://hmd/2024-11-27/hmd.zip
+  data://garden/hmd/2024-11-27/hmd:
+    - data://meadow/hmd/2024-11-27/hmd
+  data://grapher/hmd/2024-11-27/hmd:
+    - data://garden/hmd/2024-11-27/hmd
+
   # Gini Life Expectancy Inequality
   data://garden/demography/2023-10-04/gini_le:
     - data://garden/demography/2023-10-03/life_tables
   data://grapher/demography/2023-10-04/gini_le:
     - data://garden/demography/2023-10-04/gini_le
 
-  # Survivorship ages (HMD-derived)
-  data://garden/demography/2023-09-27/survivor_percentiles:
-    - data://garden/hmd/2023-09-19/hmd
-  data://grapher/demography/2023-09-27/survivor_percentiles:
-    - data://garden/demography/2023-09-27/survivor_percentiles
-
   # Phi-gender life expectancy inequality
   data://garden/demography/2023-10-03/phi_gender_le:
     - data://garden/demography/2023-10-03/life_tables
@@ -245,3 +247,9 @@ steps:
     - data://meadow/demography/2024-11-26/multiple_births
   data://grapher/demography/2024-11-26/multiple_births:
     - data://garden/demography/2024-11-26/multiple_births
+
+  # Survivorship ages (HMD-derived)
+  data://garden/demography/2024-12-02/survivor_percentiles:
+    - data://garden/hmd/2024-11-27/hmd
+  data://grapher/demography/2024-12-02/survivor_percentiles:
+    - data://garden/demography/2024-12-02/survivor_percentiles
diff --git a/dag/main.yml b/dag/main.yml
@@ -130,11 +130,6 @@ steps:
     - data://garden/regions/2023-01-01/regions
   data://grapher/technology/2022/internet:
     - data://garden/technology/2022/internet
-  # HMD
-  data://meadow/hmd/2022-12-07/life_tables:
-    - snapshot://hmd/2022-12-07/hmd.zip
-  data://garden/hmd/2022-12-07/life_tables:
-    - data://meadow/hmd/2022-12-07/life_tables
 
   # UNDP
   data://meadow/un/2024-04-09/undp_hdr:

diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.meta.yml
@@ -0,0 +1,44 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Life Expectancy
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/dataset/
+dataset:
+  title: Survivorship percentiles (HMD; Alvarez and Vaupel 2023)
+  update_period_days: 365
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/en/latest/architecture/metadata/reference/tables/
+tables:
+  survivor_percentiles:
+    variables:
+      age:
+        title: Survivorship age
+        unit: years
+        processing_level: major
+        # Templated on `percentile`. Both branches cast it with `|int` so the "1st" wording is
+        # still selected if the value arrives as a string (assumption: confirm how dimension
+        # values are passed to the template).
+        description_short: |-
+          <%- if percentile|int == 1 -%>
+          The age until which the 1st percentile (99% of the population) would survive, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year.
+          <%- else -%>
+          The age until which the << percentile >>th percentile (<< 100 - percentile|int >>% of the population) would survive, if they experienced the same age-specific death rates throughout their whole lives as the age-specific death rates seen in that particular year.
+          <%- endif -%>
+
+        description_processing: |-
+          This was calculated with the method published in Alvarez and Vaupel (2023), with code provided by the authors:
+
+          Jesús-Adrián Alvarez, James W. Vaupel; Mortality as a Function of Survival. Demography 1 February 2023; 60 (1): 327–342. doi: https://doi.org/10.1215/00703370-10429097
+
+          These estimates were regenerated for data from more recent years in the Human Mortality Database.
+
+          Original R code from: https://github.com/jssalvrz/s-ages
+        description_key:
+          - This is calculated with the period life tables indicators.
+        display:
+          numDecimalPlaces: 1
+        presentation:
+          attribution: |-
+            Alvarez & Vaupel (2023); Human Mortality Database (2024)
diff --git a/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py b/etl/steps/data/garden/demography/2024-12-02/survivor_percentiles.py
@@ -0,0 +1,137 @@
+"""Load a meadow dataset and create a garden dataset.
+
+Methods used here are taken from https://github.com/jssalvrz/s-ages. Citation: Alvarez, J.-A., & Vaupel, J. W. (2023). Mortality as a Function of Survival. Demography, 60(1), 327–342. https://doi.org/10.1215/00703370-10429097
+
+
+Dr. Saloni Dattani translated the R scripts into Python:
+    - Original: https://github.com/jssalvrz/s-ages
+    - Translated: https://github.com/saloni-nd/misc/tree/main/survivorship-ages
+
+Lucas Rodes-Guirao adapted the python code for ETL.
+"""
+
+import numpy as np
+import pandas as pd
+from owid.catalog import Table
+from scipy.integrate import cumulative_trapezoid as cumtrapz
+from scipy.interpolate import InterpolatedUnivariateSpline
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+# `paths` is used below for logging (`paths.log`) and for loading the step's input dataset.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    paths.log.info("load data.")
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("hmd")
+
+    # Read table from meadow dataset.
+    tb_deaths = ds_meadow.read("deaths")
+    tb_exposure = ds_meadow.read("exposures")
+
+    #
+    # Process data.
+    #
+    # Combine tables, drop NaNs
+    tb = tb_deaths.merge(tb_exposure, on=["country", "year", "sex", "age"], how="outer")
+    tb = tb.dropna(subset=["deaths", "exposure"], how="any")
+
+    # Keep format="1x1", and sex="both"
+    paths.log.info("keep period & 1-year data.")
+    tb = tb.loc[tb["age"].str.match(r"^(\d{1,3}|d{3}\+)$") & (tb["type"] == "period")]
+
+    # Drop unused columns
+    tb = tb.drop(columns=["type"])
+
+    # 110+ -> 110
+    paths.log.info("replace 110+ -> 100, set Dtypes.")
+    tb["age"] = tb["age"].replace({"110+": "110"}).astype(int)
+
+    # Sort
+    tb = tb.sort_values(["year", "age"])
+
+    # Actual calculation
+    paths.log.info("calculate surviorship ages (can take some minutes)...")
+    columns_grouping = ["country", "sex", "year"]
+    tb = tb.groupby(columns_grouping).apply(lambda group: obtain_survivorship_ages(group)).reset_index()  # type: ignore
+
+    # Unpivot
+    paths.log.info("reshape table")
+    tb = tb.melt(
+        id_vars=["country", "sex", "year"],
+        value_vars=["s1", "s10", "s20", "s30", "s40", "s50", "s60", "s70", "s80", "s90", "s99"],
+        var_name="percentile",
+        value_name="age",
+    )
+    tb = tb.dropna(subset=["percentile"])
+    tb["percentile"] = tb["percentile"].str.replace("s", "").astype(int)
+    tb["percentile"] = 100 - tb["percentile"]
+
+    # Propagate metadata
+    tb["age"].metadata.origins = tb_exposure["exposure"].m.origins.copy()
+
+    # Set index
+    paths.log.info("format")
+    tb = tb.format(["country", "year", "sex", "percentile"], short_name="survivor_percentiles")
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def obtain_survivorship_ages(tb_group: Table, start_age: int = 0, end_age: int = 110) -> pd.DataFrame:
+    """Get survivorship ages given a life and deaths table.
+
+    Output dataframe has a column for each percentile of survivorship age.
+
+    tb_group is expected to be a subset of the compelte table. It should only concern a particular (country, year, sex) triple.
+    """
+    # Step 1: Apply splines, get Mx for each (country, year, sex, age)
+    ## Define splines
+    ### We could use CubicSpline (k=3 order), but it provides slightly different results hence, for precaution, we sticked to InterpolatedUnivariateSpline.
+    ### This is equivalent to R function interpSpline
+    spline_deaths = InterpolatedUnivariateSpline(tb_group["age"], tb_group["deaths"], k=3)
+    spline_exposures = InterpolatedUnivariateSpline(tb_group["age"], tb_group["exposure"], k=3)
+
+    ## Define age range (with step 0.01)
+    age_range = np.arange(start_age, end_age, 0.01)
+
+    # Run splines over age range
+    deaths_spline = np.abs(spline_deaths(age_range))
+    exposure_spline = np.abs(spline_exposures(age_range))
+    exposure_spline[exposure_spline == 0] = np.nan
+    survival_age_spline = np.abs(deaths_spline / exposure_spline)
+
+    # Step 2: Calculate survival, density, hazard, and cumulative hazards
+    ## Estimate parameters
+    Hx = cumtrapz(y=survival_age_spline, x=age_range, initial=0)  # Hazard CDF
+    Sx = np.exp(-Hx)  # Survivor function
+
+    # Step 3: Calculate survivorship ages from parameters
+    out = {}
+    out["s0"] = max(age_range)
+    ## I'm using a for loop to simplify the logic here
+    for i in range(1, 101):
+        try:
+            sx_rounded = np.ceil((100 * Sx).round(3))
+            value = age_range[sx_rounded == i][0]
+            out[f"s{i}"] = value
+        except IndexError:
+            out[f"s{i}"] = np.nan
+
+    # Create output dataframe
+    df = pd.DataFrame(out, index=[0])
+
+    return df
diff --git a/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json b/etl/steps/data/garden/hmd/2024-11-27/hmd.countries.json
@@ -0,0 +1,52 @@
+{
+  "Australia": "Australia",
+  "Austria": "Austria",
+  "Belarus": "Belarus",
+  "Belgium": "Belgium",
+  "Bulgaria": "Bulgaria",
+  "Canada": "Canada",
+  "Chile": "Chile",
+  "Croatia": "Croatia",
+  "Czechia": "Czechia",
+  "Denmark": "Denmark",
+  "East Germany": "East Germany",
+  "Estonia": "Estonia",
+  "Finland": "Finland",
+  "Germany": "Germany",
+  "Greece": "Greece",
+  "Hong Kong": "Hong Kong",
+  "Hungary": "Hungary",
+  "Iceland": "Iceland",
+  "Ireland": "Ireland",
+  "Japan": "Japan",
+  "Latvia": "Latvia",
+  "Lithuania": "Lithuania",
+  "Luxembourg": "Luxembourg",
+  "Netherlands": "Netherlands",
+  "New Zealand": "New Zealand",
+  "Norway": "Norway",
+  "Poland": "Poland",
+  "Portugal": "Portugal",
+  "Republic of Korea": "South Korea",
+  "Russia": "Russia",
+  "Slovenia": "Slovenia",
+  "Spain": "Spain",
+  "Sweden": "Sweden",
+  "Switzerland": "Switzerland",
+  "Taiwan": "Taiwan",
+  "Ukraine": "Ukraine",
+  "United Kingdom": "United Kingdom",
+  "West Germany": "West Germany",
+  "England and Wales, Civilian National Population": "England and Wales (Civilians)",
+  "England and Wales, Total Population": "England and Wales",
+  "France, Civilian Population": "France (Civilians)",
+  "France, Total Population": "France",
+  "Israel, Total Population": "Israel",
+  "Italy ": "Italy",
+  "New Zealand -- Maori": "New Zealand (Maori)",
+  "New Zealand -- Non-Maori": "New Zealand (Non-Maori)",
+  "Northern Ireland": "Northern Ireland",
+  "Scotland": "Scotland",
+  "Slovakia ": "Slovakia",
+  "The United States of America": "United States"
+}