📊 wittgenstein human capital (#3702)

* 📊 wittgenstein human capital * wip * add dependency * remove comment * wip * wip * wip * wip * snapshot * wip meadow * wip * wip * wip garden + grapher * wip * wip * wip * wip * working version * filter & keep only relevant for grapher * wip dag * allow 2020 data in meadow, use numbers to identify scenarios * abstract functions * fix jinja typo * wip * allow some age groups into grapher * wip * combined dataset * renames * wip * various fixes * various fixes * minor fix * update indicator titles * archive * add scenario description * estimate prop ourselves * fix missing column drop * do not import projections-only dataset * archive
owid · Dec 11, 2024 · ad28b12 · ad28b12
1 parent b8d4a0c
commit ad28b12
Show file tree

Hide file tree

Showing 23 changed files with 3,681 additions and 339 deletions.
diff --git a/dag/archive/demography.yml b/dag/archive/demography.yml
@@ -118,3 +118,11 @@ steps:
     - data://garden/un/2022-07-11/un_wpp
   data://grapher/demography/2023-10-10/life_expectancy:
     - data://garden/demography/2023-10-09/life_expectancy
+
+  ## Population by education group (Wittgenstein centre)
+  data://meadow/demography/2023-04-07/population_education_wittgenstein:
+    - snapshot://demography/2023-04-07/population_education_wittgenstein.csv
+  data://garden/demography/2023-04-07/population_education_wittgenstein:
+    - data://meadow/demography/2023-04-07/population_education_wittgenstein
+  data://grapher/demography/2023-04-07/population_education_wittgenstein:
+    - data://garden/demography/2023-04-07/population_education_wittgenstein
diff --git a/dag/archive/education.yml b/dag/archive/education.yml
@@ -9,4 +9,14 @@ steps:
     - data://garden/education/2018-04-18/literacy_rates
 
   data://grapher/wb/2023-07-10/education:
-    - data://garden/wb/2023-07-10/education
+    - data://garden/wb/2023-07-10/education
+
+  # Wittgenstein Center Population and Human Capital Projections - Educational Attainment
+  data://meadow/education/2023-12-15/wittgenstein_center:
+    - snapshot://education/2023-12-15/wittgenstein_center_data.csv
+    - snapshot://education/2023-12-15/wittgenstein_center_dictionary.csv
+  data://garden/education/2023-12-15/wittgenstein_center:
+    - data://meadow/education/2023-12-15/wittgenstein_center
+    - data://garden/education/2023-08-14/oecd_education
+  data://grapher/education/2023-12-15/wittgenstein_center:
+    - data://garden/education/2023-12-15/wittgenstein_center
diff --git a/dag/demography.yml b/dag/demography.yml
@@ -88,13 +88,6 @@ steps:
   data://grapher/hyde/2024-01-02/all_indicators:
     - data://garden/hyde/2024-01-02/all_indicators
 
-  ## Population by education group (Wittgenstein centre)
-  data://meadow/demography/2023-04-07/population_education_wittgenstein:
-    - snapshot://demography/2023-04-07/population_education_wittgenstein.csv
-  data://garden/demography/2023-04-07/population_education_wittgenstein:
-    - data://meadow/demography/2023-04-07/population_education_wittgenstein
-  data://grapher/demography/2023-04-07/population_education_wittgenstein:
-    - data://garden/demography/2023-04-07/population_education_wittgenstein
   ## Population doubling times
   data://garden/demography/2024-07-18/population_doubling_times:
     - data://garden/demography/2024-07-15/population
@@ -270,3 +263,28 @@ steps:
     - data://garden/hmd/2024-12-01/hmd
   data://grapher/hmd/2024-12-03/hmd_country:
     - data://garden/hmd/2024-12-03/hmd_country
+
+  ########################################################################
+  # OTHERS
+  ########################################################################
+
+  # Wittgenstein Centre (Projections)
+  data://meadow/demography/2024-12-06/wittgenstein_human_capital_proj:
+    - snapshot://demography/2024-12-06/wittgenstein_human_capital.zip
+  data://garden/demography/2024-12-06/wittgenstein_human_capital_proj:
+    - data://meadow/demography/2024-12-06/wittgenstein_human_capital_proj
+  # NOTE: The grapher step for the projections-only dataset is left disabled on purpose: only the
+  # combined (projections + historical) dataset defined further below has a grapher step.
+  # data://grapher/demography/2024-12-06/wittgenstein_human_capital_proj:
+  #   - data://garden/demography/2024-12-06/wittgenstein_human_capital_proj
+
+  # Wittgenstein Centre (Historical)
+  data://meadow/demography/2024-12-06/wittgenstein_human_capital_historical:
+    - snapshot://demography/2024-12-06/wittgenstein_human_capital_historical.zip
+  data://garden/demography/2024-12-06/wittgenstein_human_capital_historical:
+    - data://meadow/demography/2024-12-06/wittgenstein_human_capital_historical
+
+  # Wittgenstein Centre (Projections + Historical)
+  data://garden/demography/2024-12-06/wittgenstein_human_capital:
+    - data://garden/demography/2024-12-06/wittgenstein_human_capital_historical
+    - data://garden/demography/2024-12-06/wittgenstein_human_capital_proj
+  data://grapher/demography/2024-12-06/wittgenstein_human_capital:
+    - data://garden/demography/2024-12-06/wittgenstein_human_capital
diff --git a/dag/education.yml b/dag/education.yml
@@ -11,7 +11,6 @@ steps:
   data://grapher/education/2023-07-17/education_barro_lee_projections:
     - data://garden/education/2023-07-17/education_barro_lee_projections
 
-
   # Barro and Lee historical estimates
   data://meadow/education/2023-07-17/education_lee_lee:
     - snapshot://education/2023-07-17/education_lee_lee.xlsx
@@ -25,8 +24,7 @@ steps:
   data://grapher/education/2023-07-17/education_lee_lee:
     - data://garden/education/2023-07-17/education_lee_lee
 
-
-# OECD historical education data
+  # OECD historical education data
   data://meadow/education/2023-08-09/clio_infra_education:
     - snapshot://education/2023-08-09/years_of_education.xlsx
     - snapshot://education/2023-08-09/years_of_education_gini.xlsx
@@ -80,16 +78,6 @@ steps:
   data://grapher/oecd/2023-12-06/pisa:
     - data://garden/oecd/2023-12-06/pisa
 
-  # Wittgenstein Center Population and Human Capital Projections - Educational Attainment
-  data://meadow/education/2023-12-15/wittgenstein_center:
-    - snapshot://education/2023-12-15/wittgenstein_center_data.csv
-    - snapshot://education/2023-12-15/wittgenstein_center_dictionary.csv
-  data://garden/education/2023-12-15/wittgenstein_center:
-    - data://meadow/education/2023-12-15/wittgenstein_center
-    - data://garden/education/2023-08-14/oecd_education
-  data://grapher/education/2023-12-15/wittgenstein_center:
-    - data://garden/education/2023-12-15/wittgenstein_center
-
   # UNESCO data on other policy related education indicators
   data://meadow/unesco/2024-06-16/education_opri:
     - snapshot://unesco/2024-06-16/education_opri.zip
@@ -117,7 +105,6 @@ steps:
   data://grapher/unesco/2024-11-21/enrolment_rates:
     - data://garden/unesco/2024-11-21/enrolment_rates
 
-
   # World Bank EdStats
   data://meadow/wb/2024-11-04/edstats:
     - snapshot://wb/2024-11-04/edstats.csv

diff --git a/etl/steps/data/garden/demography/2024-12-06/shared.py b/etl/steps/data/garden/demography/2024-12-06/shared.py
@@ -0,0 +1,147 @@
+import owid.catalog.processing as pr
+
+from etl.data_helpers import geo
+
+# Columns index
+COLUMNS_INDEX = [
+    "country",
+    "year",
+    "scenario",
+    "sex",
+    "age",
+    "education",
+]
+
+
+def make_table(
+    tb,
+    country_mapping_path,
+    dtypes=None,
+    all_single=False,
+    all_range=False,
+    cols_single=None,
+    cols_range=None,
+    per_10=None,
+    per_100=None,
+    per_1000=None,
+    div_10=None,
+    div_100=None,
+    div_1000=None,
+):
+    """Clean a raw table: cast dtypes, normalise `year`, harmonize countries and rescale columns.
+
+    Args:
+        tb: Table with at least `country`, `year` and `scenario` columns.
+        country_mapping_path: Passed to `geo.harmonize_countries` as `countries_file`.
+        dtypes: Extra column dtypes; merged over the defaults (`scenario` as UInt8, `country` as category).
+        all_single: If True, every `year` value is treated as a single year and cast to Int32.
+            Takes precedence over `all_range` when both are set.
+        all_range: If True (and `all_single` is False), every `year` value must be a period such as
+            "2020-2025"; it is replaced by the year at the end of the string.
+        cols_single: Columns reported for single years. Required when neither flag above is set.
+        cols_range: Columns reported for periods. Required when neither flag above is set.
+        per_10, per_100, per_1000: Columns to multiply by 10, 100 and 1000, respectively.
+        div_10, div_100, div_1000: Columns to divide by 10, 100 and 1000, respectively.
+
+    Returns:
+        The processed table.
+
+    Raises:
+        ValueError: If neither `all_single` nor `all_range` is set and `cols_single` or `cols_range`
+            is missing.
+        AssertionError: If a sanity check on the data fails (non-period years with `all_range`, or
+            scenario IDs other than 1 to 5).
+    """
+    # Fail early with a clear message instead of a TypeError deep inside the consolidation helper.
+    if not (all_single or all_range) and (cols_single is None or cols_range is None):
+        raise ValueError(
+            "When neither `all_single` nor `all_range` is set, both `cols_single` and `cols_range` must be given."
+        )
+
+    dtypes = {**{"scenario": "UInt8", "country": "category"}, **(dtypes or {})}
+    tb = tb.astype(dtypes)
+
+    if all_single:
+        tb["year"] = tb["year"].astype("Int32")
+    elif all_range:
+        assert tb["year"].str.contains("-").all(), "Some years are not ranges!"
+        # Keep the trailing year of the period (tolerating a ".0" float suffix).
+        tb["year"] = tb["year"].str.extract(r"(\d{4}\.?0?)$").astype("Float32").astype("Int32")
+    else:
+        tb = consolidate_year_single_and_ranges(
+            tb=tb,
+            cols_single=cols_single,
+            cols_range=cols_range,
+        )
+
+    # Ensure expected scenario IDs
+    scenarios = set(tb["scenario"].unique())
+    assert scenarios == set(range(1, 6)), f"Unexpected scenario IDs: {scenarios}"
+
+    # Harmonize country names
+    tb = geo.harmonize_countries(
+        df=tb,
+        countries_file=country_mapping_path,
+        show_full_warning=False,
+    )
+
+    # Scale
+    tb = scale_values(
+        tb, per_10=per_10, per_100=per_100, per_1000=per_1000, div_10=div_10, div_100=div_100, div_1000=div_1000
+    )
+    return tb
+
+
+def scale_values(tb, per_10=None, per_100=None, per_1000=None, div_10=None, div_100=None, div_1000=None):
+    """Rescale columns of `tb` in place by fixed powers of ten.
+
+    Each argument is an optional iterable of column names; `None` means "no columns".
+    Columns in `per_10`, `per_100` and `per_1000` are multiplied by 10, 100 and 1000;
+    columns in `div_10`, `div_100` and `div_1000` are divided by 10, 100 and 1000.
+    All multiplications are applied before any division, and a column listed in several
+    arguments receives every corresponding operation.
+
+    Returns the same table object that was passed in.
+    """
+    multiplications = ((per_10, 10), (per_100, 100), (per_1000, 1000))
+    divisions = ((div_10, 10), (div_100, 100), (div_1000, 1000))
+
+    for columns, factor in multiplications:
+        if columns is None:
+            continue
+        for column in columns:
+            tb[column] *= factor
+
+    for columns, divisor in divisions:
+        if columns is None:
+            continue
+        for column in columns:
+            tb[column] /= divisor
+
+    return tb
+
+
+def consolidate_year_single_and_ranges(tb, cols_single, cols_range):
+    """Align single-year and period rows of `tb` on a common integer `year`.
+
+    `year` is expected to hold strings that are either a single year or a period such as
+    "2020-2025". A period is mapped to the year at the end of the string (2025 in the example).
+    The single-year columns (`cols_single`) and the period columns (`cols_range`) are then split
+    into two tables and outer-merged on the index columns, so that both kinds of indicator end
+    up side by side.
+
+    Note that the `year` column of the input table is overwritten.
+
+    Args:
+        tb: Table with a string `year` column.
+        cols_single: List of the columns expected to carry data in single-year rows.
+        cols_range: List of the columns expected to carry data in period rows.
+
+    Raises:
+        AssertionError: If the non-index columns with data in single-year (or period) rows
+            differ from `cols_single` (or `cols_range`).
+    """
+    index_names = set(COLUMNS_INDEX)
+    year_regex = r"(\d{4}\.?0?)$"
+
+    # Rows whose year is a period ("2020-2025") rather than a single year
+    is_range = tb["year"].str.contains("-")
+
+    # Sanity check: single-year rows only carry data in the expected columns
+    found_single = set(tb.loc[~is_range].dropna(axis=1, how="all").columns) - index_names
+    assert found_single == set(cols_single), f"Unexpected columns in single year data: {found_single}"
+
+    # Sanity check: period rows only carry data in the expected columns
+    found_range = set(tb.loc[is_range].dropna(axis=1, how="all").columns) - index_names
+    assert found_range == set(cols_range), f"Unexpected columns in range year data: {found_range}"
+
+    # Keep the trailing year only (tolerating a ".0" float suffix) and make it an integer
+    end_year = tb["year"].str.extract(year_regex)
+    tb["year"] = end_year.astype("Float32").astype("Int32")
+
+    # Split into one table per kind of year, dropping rows without any value
+    key_cols = list(tb.columns.intersection(COLUMNS_INDEX))
+    single_part = tb[key_cols + cols_single].dropna(subset=cols_single, how="all")
+    range_part = tb[key_cols + cols_range].dropna(subset=cols_range, how="all")
+
+    # Bring both kinds of indicator back together
+    return single_part.merge(range_part, on=key_cols, how="outer")
+
+
+def add_dim_some_education(tb):
+    """Append rows with education="some_education" to a sex+age+education table.
+
+    For every (country, year, age, sex, scenario) combination that has a `pop` value for both
+    education="total" and education="no_education", a new row is added whose `pop` is the
+    difference total - no_education. Only `pop` is filled in for the new rows.
+
+    The `education` column of the input table gets the new category added, and the result is
+    cast back to the dtypes of the input.
+
+    NOTE(review): the previous docstring said the dimension is only added for sex=total and
+    age=total, but no such filter is applied here. Presumably callers pass a table that is
+    already restricted that way; confirm against the callers.
+
+    Raises:
+        AssertionError: If any derived value is negative.
+    """
+    SOME_EDUCATION = "some_education"
+    keys = ["country", "year", "age", "sex", "scenario"]
+
+    # Population of the "total" and "no_education" groups, one column per group
+    mask = tb["education"].isin(["total", "no_education"])
+    tb_wide = tb.loc[mask, keys + ["education", "pop"]]
+    tb_wide = tb_wide.pivot(index=keys, columns="education", values="pop")
+    tb_wide = tb_wide.reset_index().dropna()
+
+    # Some education = everyone minus those without education
+    tb_wide[SOME_EDUCATION] = tb_wide["total"] - tb_wide["no_education"]
+    assert (tb_wide[SOME_EDUCATION] >= 0).all()
+
+    # Back to long format, keeping only the new group
+    tb_some = tb_wide.melt(id_vars=keys, value_vars=SOME_EDUCATION, var_name="education", value_name="pop")
+
+    # Register the new category first, so that the dtypes captured below already include it
+    tb["education"] = tb["education"].cat.add_categories([SOME_EDUCATION])
+    original_dtypes = tb.dtypes
+
+    tb = pr.concat([tb, tb_some], ignore_index=True)
+    return tb.astype(original_dtypes)
+
+
+def get_index_columns(tb):
+    """Return, as a list, the columns of `tb` that are listed in `COLUMNS_INDEX`."""
+    return list(tb.columns.intersection(COLUMNS_INDEX))
+
+
+def add_prop(tb):
+    """Add column `prop`: `pop` as a percentage of the matching education="total" `pop`.
+
+    The denominator is taken from the rows with education="total" and matched on
+    country, year, age, sex and scenario. The table must contain an `assr` column, which is
+    dropped from the totals before merging. The merge uses its default join type
+    (NOTE(review): inner in pandas, which drops rows without a matching total; confirm
+    the table class behaves the same).
+
+    `prop` takes its metadata from `pop`; the helper `pop_total` column is removed again.
+    """
+    keys = ["country", "year", "age", "sex", "scenario"]
+
+    # Rows for education="total" provide the denominator
+    is_total = tb["education"] == "total"
+    tb_total = tb[is_total].drop(columns=["education", "assr"])
+
+    # Attach the total as `pop_total` to every education group
+    tb = tb.merge(tb_total, on=keys, suffixes=["", "_total"])
+
+    share = 100 * tb["pop"] / tb["pop_total"]
+    tb["prop"] = share.copy_metadata(tb["pop"])
+
+    return tb.drop(columns=["pop_total"])