Skip to content

Commit

Permalink
📊 wittgenstein human capital (#3702)
Browse files Browse the repository at this point in the history
* 📊 wittgenstein human capital

* wip

* add dependency

* remove comment

* wip

* wip

* wip

* wip

* snapshot

* wip meadow

* wip

* wip

* wip garden + grapher

* wip

* wip

* wip

* wip

* working version

* filter & keep only relevant for grapher

* wip dag

* allow 2020 data in meadow, use numbers to identify scenarios

* abstract functions

* fix jinja typo

* wip

* allow some age groups into grapher

* wip

* combined dataset

* renames

* wip

* various fixes

* various fixes

* minor fix

* update indicator titles

* archive

* add scenario description

* estimate prop ourselves

* fix missing column drop

* do not import projections-only dataset

* archive
  • Loading branch information
lucasrodes authored Dec 11, 2024
1 parent b8d4a0c commit ad28b12
Show file tree
Hide file tree
Showing 23 changed files with 3,681 additions and 339 deletions.
8 changes: 8 additions & 0 deletions dag/archive/demography.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,11 @@ steps:
- data://garden/un/2022-07-11/un_wpp
data://grapher/demography/2023-10-10/life_expectancy:
- data://garden/demography/2023-10-09/life_expectancy

## Population by education group (Wittgenstein centre)
data://meadow/demography/2023-04-07/population_education_wittgenstein:
- snapshot://demography/2023-04-07/population_education_wittgenstein.csv
data://garden/demography/2023-04-07/population_education_wittgenstein:
- data://meadow/demography/2023-04-07/population_education_wittgenstein
data://grapher/demography/2023-04-07/population_education_wittgenstein:
- data://garden/demography/2023-04-07/population_education_wittgenstein
12 changes: 11 additions & 1 deletion dag/archive/education.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,14 @@ steps:
- data://garden/education/2018-04-18/literacy_rates

data://grapher/wb/2023-07-10/education:
- data://garden/wb/2023-07-10/education
- data://garden/wb/2023-07-10/education

# Wittgenstein Center Population and Human Capital Projections - Educational Attainment
data://meadow/education/2023-12-15/wittgenstein_center:
- snapshot://education/2023-12-15/wittgenstein_center_data.csv
- snapshot://education/2023-12-15/wittgenstein_center_dictionary.csv
data://garden/education/2023-12-15/wittgenstein_center:
- data://meadow/education/2023-12-15/wittgenstein_center
- data://garden/education/2023-08-14/oecd_education
data://grapher/education/2023-12-15/wittgenstein_center:
- data://garden/education/2023-12-15/wittgenstein_center
32 changes: 25 additions & 7 deletions dag/demography.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,6 @@ steps:
data://grapher/hyde/2024-01-02/all_indicators:
- data://garden/hyde/2024-01-02/all_indicators

## Population by education group (Wittgenstein centre)
data://meadow/demography/2023-04-07/population_education_wittgenstein:
- snapshot://demography/2023-04-07/population_education_wittgenstein.csv
data://garden/demography/2023-04-07/population_education_wittgenstein:
- data://meadow/demography/2023-04-07/population_education_wittgenstein
data://grapher/demography/2023-04-07/population_education_wittgenstein:
- data://garden/demography/2023-04-07/population_education_wittgenstein
## Population doubling times
data://garden/demography/2024-07-18/population_doubling_times:
- data://garden/demography/2024-07-15/population
Expand Down Expand Up @@ -270,3 +263,28 @@ steps:
- data://garden/hmd/2024-12-01/hmd
data://grapher/hmd/2024-12-03/hmd_country:
- data://garden/hmd/2024-12-03/hmd_country

########################################################################
# OTHERS
########################################################################

# Wittgenstein Centre (Projections)
data://meadow/demography/2024-12-06/wittgenstein_human_capital_proj:
- snapshot://demography/2024-12-06/wittgenstein_human_capital.zip
data://garden/demography/2024-12-06/wittgenstein_human_capital_proj:
- data://meadow/demography/2024-12-06/wittgenstein_human_capital_proj
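# NOTE: The projections-only grapher step is intentionally disabled; projections are published through the combined (projections + historical) dataset below.
# data://grapher/demography/2024-12-06/wittgenstein_human_capital_proj:
# - data://garden/demography/2024-12-06/wittgenstein_human_capital_proj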

# Wittgenstein Centre (Historical)
data://meadow/demography/2024-12-06/wittgenstein_human_capital_historical:
- snapshot://demography/2024-12-06/wittgenstein_human_capital_historical.zip
data://garden/demography/2024-12-06/wittgenstein_human_capital_historical:
- data://meadow/demography/2024-12-06/wittgenstein_human_capital_historical

# Wittgenstein Centre (Projections + Historical)
data://garden/demography/2024-12-06/wittgenstein_human_capital:
- data://garden/demography/2024-12-06/wittgenstein_human_capital_historical
- data://garden/demography/2024-12-06/wittgenstein_human_capital_proj
data://grapher/demography/2024-12-06/wittgenstein_human_capital:
- data://garden/demography/2024-12-06/wittgenstein_human_capital
15 changes: 1 addition & 14 deletions dag/education.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ steps:
data://grapher/education/2023-07-17/education_barro_lee_projections:
- data://garden/education/2023-07-17/education_barro_lee_projections


# Barro and Lee historical estimates
data://meadow/education/2023-07-17/education_lee_lee:
- snapshot://education/2023-07-17/education_lee_lee.xlsx
Expand All @@ -25,8 +24,7 @@ steps:
data://grapher/education/2023-07-17/education_lee_lee:
- data://garden/education/2023-07-17/education_lee_lee


# OECD historical education data
# OECD historical education data
data://meadow/education/2023-08-09/clio_infra_education:
- snapshot://education/2023-08-09/years_of_education.xlsx
- snapshot://education/2023-08-09/years_of_education_gini.xlsx
Expand Down Expand Up @@ -80,16 +78,6 @@ steps:
data://grapher/oecd/2023-12-06/pisa:
- data://garden/oecd/2023-12-06/pisa

# Wittgenstein Center Population and Human Capital Projections - Educational Attainment
data://meadow/education/2023-12-15/wittgenstein_center:
- snapshot://education/2023-12-15/wittgenstein_center_data.csv
- snapshot://education/2023-12-15/wittgenstein_center_dictionary.csv
data://garden/education/2023-12-15/wittgenstein_center:
- data://meadow/education/2023-12-15/wittgenstein_center
- data://garden/education/2023-08-14/oecd_education
data://grapher/education/2023-12-15/wittgenstein_center:
- data://garden/education/2023-12-15/wittgenstein_center

# UNESCO data on other policy related education indicators
data://meadow/unesco/2024-06-16/education_opri:
- snapshot://unesco/2024-06-16/education_opri.zip
Expand Down Expand Up @@ -117,7 +105,6 @@ steps:
data://grapher/unesco/2024-11-21/enrolment_rates:
- data://garden/unesco/2024-11-21/enrolment_rates


# World Bank EdStats
data://meadow/wb/2024-11-04/edstats:
- snapshot://wb/2024-11-04/edstats.csv
Expand Down
147 changes: 147 additions & 0 deletions etl/steps/data/garden/demography/2024-12-06/shared.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import owid.catalog.processing as pr

from etl.data_helpers import geo

# Columns index: names of the dimension columns used to identify rows.
# The helpers in this module intersect this list with (or subtract it from) a
# table's actual columns, so a given table only needs to contain a subset of them.
COLUMNS_INDEX = [
"country",
"year",
"scenario",
"sex",
"age",
"education",
]


def make_table(
    tb,
    country_mapping_path,
    dtypes=None,
    all_single=False,
    all_range=False,
    cols_single=None,
    cols_range=None,
    per_10=None,
    per_100=None,
    per_1000=None,
    div_10=None,
    div_100=None,
    div_1000=None,
):
    """Cast dtypes, normalise the year column, harmonize countries and rescale indicators.

    Args:
        tb: Input table. Must contain the columns "year", "scenario" and "country".
        country_mapping_path: Passed as `countries_file` to `geo.harmonize_countries`.
        dtypes: Optional extra column dtypes. They are merged on top of the defaults
            (scenario -> "UInt8", country -> "category") and win on conflict.
        all_single: If truthy, "year" is cast directly to "Int32".
        all_range: Only considered when `all_single` is falsy. If truthy, every
            "year" value must contain "-", and the trailing four-digit year is kept.
        cols_single: Columns expected for single-year rows; used only when both
            `all_single` and `all_range` are falsy.
        cols_range: Columns expected for year-range rows; used only when both
            `all_single` and `all_range` are falsy.
        per_10, per_100, per_1000: Columns to multiply by 10, 100 and 1000.
        div_10, div_100, div_1000: Columns to divide by 10, 100 and 1000.

    Returns:
        The processed table.

    Raises:
        AssertionError: If `all_range` is set and some year is not a range, or if
            the scenario values are not exactly 1 to 5.
    """
    # Caller-supplied dtypes override the defaults.
    default_dtypes = {"scenario": "UInt8", "country": "category"}
    tb = tb.astype({**default_dtypes, **(dtypes or {})})

    if all_single:
        tb["year"] = tb["year"].astype("Int32")
    elif all_range:
        is_range = tb["year"].str.contains("-")
        assert is_range.all(), "Some years are not ranges!"
        # Keep the last year of each range (e.g. the end of "2020-2025").
        last_year = tb["year"].str.extract(r"(\d{4}\.?0?)$")
        tb["year"] = last_year.astype("Float32").astype("Int32")
    else:
        # Mixed table: some rows are single years, others are ranges.
        tb = consolidate_year_single_and_ranges(
            tb=tb,
            cols_single=cols_single,
            cols_range=cols_range,
        )

    # Ensure expected scenario IDs
    scenarios_found = set(tb["scenario"].unique())
    assert scenarios_found == {1, 2, 3, 4, 5}

    # Harmonize country names
    tb = geo.harmonize_countries(
        df=tb,
        countries_file=country_mapping_path,
        show_full_warning=False,
    )

    # Scale
    scaling = {
        "per_10": per_10,
        "per_100": per_100,
        "per_1000": per_1000,
        "div_10": div_10,
        "div_100": div_100,
        "div_1000": div_1000,
    }
    return scale_values(tb, **scaling)


def scale_values(tb, per_10=None, per_100=None, per_1000=None, div_10=None, div_100=None, div_1000=None):
    """Multiply or divide selected columns of `tb` by 10, 100 or 1000.

    Each argument other than `tb` is either None (skipped) or an iterable of
    column names. Multiplications are applied first (10, 100, 1000), then
    divisions (10, 100, 1000); a column listed several times is scaled each time.

    The columns are reassigned on `tb` itself, and the same object is returned.
    """
    multiply_by = ((per_10, 10), (per_100, 100), (per_1000, 1000))
    divide_by = ((div_10, 10), (div_100, 100), (div_1000, 1000))

    for columns, factor in multiply_by:
        if columns is None:
            continue
        for name in columns:
            tb[name] *= factor

    for columns, factor in divide_by:
        if columns is None:
            continue
        for name in columns:
            tb[name] /= factor

    return tb


def consolidate_year_single_and_ranges(tb, cols_single, cols_range):
    """Align single-year rows and year-range rows on a common integer "year".

    Rows whose "year" contains "-" are treated as ranges; all other rows are
    treated as single years. A range such as "2020-2025" is mapped to its last
    year (2025), so that range indicators line up with single-year indicators.

    Args:
        tb: Table with a string "year" column. Its "year" column is overwritten
            with the parsed integer year.
        cols_single: List of the indicator columns that hold data in single-year rows.
        cols_range: List of the indicator columns that hold data in year-range rows.

    Returns:
        Outer merge, on the index columns present in `tb`, of the single-year
        indicators and the range indicators.

    Raises:
        AssertionError: If the non-empty, non-index columns of either group of
            rows differ from `cols_single` / `cols_range`.
    """
    index_names = set(COLUMNS_INDEX)
    is_range = tb["year"].str.contains("-")

    # Check columns for single year data (ignoring columns that are entirely empty)
    tb_single_rows = tb.loc[~is_range].dropna(axis=1, how="all")
    single_year_cols = set(tb_single_rows.columns).difference(index_names)
    assert single_year_cols == set(cols_single), f"Unexpected columns in single year data: {single_year_cols}"

    # Check columns for range year data (ignoring columns that are entirely empty)
    tb_range_rows = tb.loc[is_range].dropna(axis=1, how="all")
    range_year_cols = set(tb_range_rows.columns).difference(index_names)
    assert range_year_cols == set(cols_range), f"Unexpected columns in range year data: {range_year_cols}"

    # Fix year type: keep the trailing year, which for a range is its last year
    last_year = tb["year"].str.extract(r"(\d{4}\.?0?)$")
    tb["year"] = last_year.astype("Float32").astype("Int32")

    # Build one table per kind of indicator, dropping rows with no data for that kind
    keys = list(tb.columns.intersection(COLUMNS_INDEX))
    single_part = tb[keys + cols_single].dropna(subset=cols_single, how="all")
    range_part = tb[keys + cols_range].dropna(subset=cols_range, how="all")

    # Merge back
    return single_part.merge(range_part, on=keys, how="outer")


def add_dim_some_education(tb):
    """Append rows with education="some_education" to a sex+age+education table.

    The new value of "pop" is pop(total) - pop(no_education), computed for each
    (country, year, age, sex, scenario) combination that survives the pivot and
    `dropna` step below. All other indicator columns are left unset for the new rows.

    NOTE(review): the previous docstring stated this is only added for sex=total
    and age=total; no such filter is applied in this function, so presumably the
    input is already restricted that way. Confirm against callers.

    The "education" column of the input `tb` is reassigned with the extra
    category. The returned table has the original column dtypes.

    Raises:
        AssertionError: If any computed "some_education" population is negative.
    """
    SOME_EDUCATION = "some_education"
    keys = ["country", "year", "age", "sex", "scenario"]

    # One row per key, with "total" and "no_education" populations side by side
    is_needed = tb["education"].isin(["total", "no_education"])
    tb_wide = tb.loc[is_needed, keys + ["education", "pop"]]
    tb_wide = tb_wide.pivot(index=keys, columns="education", values="pop")
    tb_wide = tb_wide.reset_index().dropna()

    tb_wide[SOME_EDUCATION] = tb_wide["total"] - tb_wide["no_education"]
    assert (tb_wide[SOME_EDUCATION] >= 0).all()

    # Back to long format, keeping only the new education level
    tb_extra = tb_wide.melt(
        id_vars=keys,
        value_vars=SOME_EDUCATION,
        var_name="education",
        value_name="pop",
    )

    # Register the new category before concatenating, then restore the dtypes
    # captured from `tb` after the concatenation.
    tb["education"] = tb["education"].cat.add_categories([SOME_EDUCATION])
    original_dtypes = tb.dtypes
    tb = pr.concat([tb, tb_extra], ignore_index=True)
    return tb.astype(original_dtypes)


def get_index_columns(tb):
    """Return, as a list, the columns of `tb` that are also listed in COLUMNS_INDEX."""
    return list(tb.columns.intersection(COLUMNS_INDEX))


def add_prop(tb):
    """Add column "prop": `pop` as a percentage of the education="total" `pop`.

    Each row is matched with the education="total" row that has the same
    country, year, age, sex and scenario. Rows without such a match are dropped,
    because the merge uses the default inner join.

    Args:
        tb: Table with columns "country", "year", "age", "sex", "scenario",
            "education" and "pop".

    Returns:
        The table with a new "prop" column, which takes its metadata from "pop".
    """
    keys = ["country", "year", "age", "sex", "scenario"]

    # Take only the join keys and `pop` from the "total" rows. Selecting the
    # needed columns (rather than dropping a hard-coded list such as "assr")
    # avoids a KeyError when that column is absent, and stops other indicator
    # columns from leaking into the result as "<col>_total".
    tb_total = tb.loc[tb["education"] == "total", keys + ["pop"]]

    tb = tb.merge(tb_total, on=keys, suffixes=("", "_total"))
    tb["prop"] = (100 * tb["pop"] / tb["pop_total"]).copy_metadata(tb["pop"])
    tb = tb.drop(columns=["pop_total"])
    return tb
Loading

0 comments on commit ad28b12

Please sign in to comment.