📊 share of population educated (#3715)

* 📊 share of population educated * wip * add adult group * fix * working * dag typo * archive step * format
owid · Dec 11, 2024 · 883f058 · 883f058
1 parent 7d89c47
commit 883f058
Show file tree

Hide file tree

Showing 7 changed files with 204 additions and 3 deletions.
diff --git a/dag/education.yml b/dag/education.yml
@@ -24,7 +24,7 @@ steps:
   data://grapher/education/2023-07-17/education_lee_lee:
     - data://garden/education/2023-07-17/education_lee_lee
 
-  # OECD historical education data
+  # CLIO/INFRA
   data://meadow/education/2023-08-09/clio_infra_education:
     - snapshot://education/2023-08-09/years_of_education.xlsx
     - snapshot://education/2023-08-09/years_of_education_gini.xlsx
@@ -36,6 +36,7 @@ steps:
   data://grapher/education/2023-08-09/clio_infra_education:
     - data://garden/education/2023-08-09/clio_infra_education
 
+  # OECD historical education data
   data://meadow/education/2023-08-14/oecd_education:
     - snapshot://education/2023-08-14/oecd_education.csv
   data://garden/education/2023-08-14/oecd_education:
@@ -44,20 +45,23 @@ steps:
   data://grapher/education/2023-08-14/oecd_education:
     - data://garden/education/2023-08-14/oecd_education
 
+  # ILOstat Unemployment
   data://meadow/ilostat/2023-09-19/unemployment:
     - snapshot://ilostat/2023-09-19/unemployment.csv
   data://garden/ilostat/2023-09-19/unemployment:
     - data://meadow/ilostat/2023-09-19/unemployment
   data://grapher/ilostat/2023-09-19/unemployment:
     - data://garden/ilostat/2023-09-19/unemployment
 
+  # ILOstat Employment
   data://meadow/ilostat/2023-09-19/employment:
     - snapshot://ilostat/2023-09-19/employment.csv
   data://garden/ilostat/2023-09-19/employment:
     - data://meadow/ilostat/2023-09-19/employment
   data://grapher/ilostat/2023-09-19/employment:
     - data://garden/ilostat/2023-09-19/employment
 
+  # ILOstat Average Work Hours
   data://meadow/ilostat/2023-09-19/average_work_hours:
     - snapshot://ilostat/2023-09-19/average_work_hours.csv
   data://garden/ilostat/2023-09-19/average_work_hours:
@@ -78,6 +82,13 @@ steps:
   data://grapher/oecd/2023-12-06/pisa:
     - data://garden/oecd/2023-12-06/pisa
 
+  # OECD + Wittgenstein: People with education (long-run)
+  data://garden/education/2024-12-11/people_with_education:
+    - data://garden/demography/2024-12-06/wittgenstein_human_capital
+    - data://garden/education/2023-08-14/oecd_education
+  data://grapher/education/2024-12-11/people_with_education:
+    - data://garden/education/2024-12-11/people_with_education
+
   # UNESCO data on other policy related education indicators
   data://meadow/unesco/2024-06-16/education_opri:
     - snapshot://unesco/2024-06-16/education_opri.zip

diff --git a/etl/steps/data/garden/demography/2024-12-06/shared.py b/etl/steps/data/garden/demography/2024-12-06/shared.py
@@ -133,6 +133,30 @@ def add_dim_some_education(tb):
     return tb
 
 
+def add_dim_15plus(tb):
+    # Pivot table to have one column per age group: "0-4", "5-9", "10-14" and "total"
+    tb_adults = tb.loc[tb["age"].isin(["0-4", "5-9", "10-14", "total"]) & (tb["education"] != "total")]
+    cols_index = ["country", "scenario", "sex", "education", "year"]
+    tb_adults = tb_adults.pivot(index=cols_index, columns="age", values="pop").reset_index()
+    # Only estimate values for adults when "total" is not NA
+    tb_adults = tb_adults.dropna(subset=["total"])
+    # Estimate adults (15+) as "total" minus the 0-14 age groups ("0-4", "5-9", "10-14")
+    # Fill NAs in the 0-14 age groups with zero. NAs mostly come from 'doesn't apply' (e.g. primary education for 0-14)
+    tb_adults["15+"] = (
+        tb_adults["total"] - tb_adults["0-4"].fillna(0) - tb_adults["5-9"].fillna(0) - tb_adults["10-14"].fillna(0)
+    )
+    # Drop columns
+    tb_adults = tb_adults.drop(columns=["0-4", "5-9", "10-14", "total"])
+    # Replace negative values with zero
+    flag = tb_adults["15+"] < 0
+    tb_adults.loc[flag, "15+"] = 0
+    # Shape table
+    tb_adults = tb_adults.melt(id_vars=cols_index, value_name="pop", var_name="age")
+    # Concatenate with original table
+    tb = pr.concat([tb, tb_adults], ignore_index=True)
+    return tb
+
+
 def get_index_columns(tb):
     cols_index = list(tb.columns.intersection(COLUMNS_INDEX))
     return cols_index

diff --git a/etl/steps/data/garden/demography/2024-12-06/wittgenstein_human_capital_historical.py b/etl/steps/data/garden/demography/2024-12-06/wittgenstein_human_capital_historical.py
@@ -2,7 +2,7 @@
 
 from etl.helpers import PathFinder, create_dataset
 
-from .shared import add_dim_some_education, add_prop, make_table
+from .shared import add_dim_15plus, add_dim_some_education, add_prop, make_table
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
@@ -105,6 +105,9 @@ def run(dest_dir: str) -> None:
     # Add education="some_education" (only for sex=total and age=total, and indicator 'pop')
     tb_sex_age_edu = add_dim_some_education(tb_sex_age_edu)
 
+    # Add 15+ age group
+    tb_sex_age_edu = add_dim_15plus(tb_sex_age_edu)
+
     # Add population share
     tb_sex_age_edu = add_prop(tb_sex_age_edu)
 

diff --git a/etl/steps/data/garden/demography/2024-12-06/wittgenstein_human_capital_proj.py b/etl/steps/data/garden/demography/2024-12-06/wittgenstein_human_capital_proj.py
@@ -2,7 +2,7 @@
 
 from etl.helpers import PathFinder, create_dataset
 
-from .shared import add_dim_some_education, add_prop, make_table
+from .shared import add_dim_15plus, add_dim_some_education, add_prop, make_table
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
@@ -105,6 +105,9 @@ def run(dest_dir: str) -> None:
     # Add education="some_education" (only for sex=total and age=total, and indicator 'pop')
     tb_sex_age_edu = add_dim_some_education(tb_sex_age_edu)
 
+    # Add 15+ age group
+    tb_sex_age_edu = add_dim_15plus(tb_sex_age_edu)
+
     # Add population share
     tb_sex_age_edu = add_prop(tb_sex_age_edu)
 

diff --git a/etl/steps/data/garden/education/2024-12-11/people_with_education.meta.yml b/etl/steps/data/garden/education/2024-12-11/people_with_education.meta.yml
@@ -0,0 +1,43 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    processing_level: major
+    presentation:
+      topic_tags:
+        - Global Education
+    display:
+      numDecimalPlaces: 1
+    short_unit: "%"
+    unit: "%"
+    description_key:
+      - Historical data for educational attainment between 1870 and 1950 comes from van Zanden, J. et al. (2014).
+      - "Data for 1950 to 2015 is sourced from the Wittgenstein Centre for Demography and Global Human Capital. These projections are based on collected census and survey data. The SSP2 is a middle-of-the-road scenario that combines medium fertility with medium mortality, medium migration, and the Global Education Trend (GET) education scenario. For more information and other projection models, consult the Wittgenstein Centre for Demography and Global Human Capital's website: https://dataexplorer.wittgensteincentre.org/."
+      - Data for 2020 onwards is also based on the Medium Shared Socioeconomic Pathways (SSP2) Wittgenstein Centre for Demography and Global Human Capital projections. For more information, see https://pure.iiasa.ac.at/id/eprint/19487/.
+
+dataset:
+  update_period_days: 365
+  title: People with formal basic education (Wittgenstein Centre, OECD)
+
+tables:
+  people_with_education:
+    variables:
+      no_basic_education:
+        title: Share of population with no education
+        description_short: Share of people aged 15 or older who have not received any kind of formal [primary](#dod:primary-education), [secondary](#dod:secondary-education), or [tertiary](#dod:tertiary-education) education.
+        description_processing: |-
+          For each country and year, the share of the population aged 15 and older with no formal education was calculated. This involved summing up the population with no formal education and dividing it by the total population aged 15 and older for each country and year, then converting this ratio into a percentage.
+
+          A global estimate was calculated for each year by summing the total population aged 15 and older across all countries and the total population within this age group with no formal education. The share of the global population aged 15+ with no formal education was then computed for each year.
+
+          Historical data from van Zanden, J. et al. (2014) with estimates from 1870 to 1950 was combined with educational attainment estimates from the Wittgenstein Centre for Demography and Global Human Capital.
+      basic_education:
+        title: Share of population with at least some basic education
+        description_short: Share of people aged 15 or older who have received at least some kind of formal [primary](#dod:primary-education), [secondary](#dod:secondary-education), or [tertiary](#dod:tertiary-education) education.
+        description_processing: |-
+          For each country and year, the share of the population aged 15 and older with no formal education was calculated. This involved summing up the population with no formal education and dividing it by the total population aged 15 and older for each country and year, then converting this ratio into a percentage.
+
+          A global estimate was calculated for each year by summing the total population aged 15 and older across all countries and the total population within this age group with no formal education. The share of the global population aged 15+ with no formal education was then computed for each year.
+
+          Historical data from van Zanden, J. et al. (2014) with estimates from 1870 to 1950 was combined with educational attainment estimates from the Wittgenstein Centre for Demography and Global Human Capital.
+
+          To calculate the share of the population with at least some basic education, the share of the population with no formal education was subtracted from 100%.
diff --git a/etl/steps/data/garden/education/2024-12-11/people_with_education.py b/etl/steps/data/garden/education/2024-12-11/people_with_education.py
@@ -0,0 +1,85 @@
+"""Combine the OECD historical and Wittgenstein Centre garden datasets into a garden dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load Wittgenstein data
+    ds_wc = paths.load_dataset("wittgenstein_human_capital")
+    tb_wc = ds_wc.read("by_sex_age_edu")
+
+    # Load garden historical OECD dataset.
+    ds_oecd = paths.load_dataset("oecd_education")
+    tb_oecd = ds_oecd.read("oecd_education")
+
+    #
+    # Process data.
+    #
+    # Prepare OECD
+    tb_oecd = make_oecd(tb_oecd)
+    countries_oecd = set(tb_oecd["country"].unique())
+
+    # Prepare Wittgenstein Center
+    tb_wc = make_wc(tb_wc)
+    countries_wc = set(tb_wc["country"].unique())
+
+    # Combine tables
+    tb = pr.concat([tb_oecd, tb_wc], short_name="education")
+    # Keep only relevant countries
+    countries = countries_oecd.intersection(countries_wc)
+    tb = tb.loc[tb["country"].isin(countries)]
+    # Format
+    tb = tb.format(["country", "year"], short_name="people_with_education")
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset from the combined table.
+    ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def make_oecd(tb):
+    # Keep only years before 1950 (Wittgenstein Centre data is used from 1950 onwards)
+    tb = tb.loc[
+        tb["year"] < 1950, ["country", "year", "no_formal_education", "population_with_basic_education"]
+    ].reset_index(drop=True)
+
+    # Rename columns
+    tb = tb.rename(
+        columns={
+            "no_formal_education": "no_basic_education",
+            "population_with_basic_education": "basic_education",
+        }
+    )
+    return tb
+
+
+def make_wc(tb):
+    tb = tb.loc[
+        (tb["scenario"] == 2)
+        # & (tb_wc["country"] == "World")
+        & (tb["sex"] == "total")
+        & (tb["age"] == "15+")
+        & (tb["education"].isin(["no_education"])),
+        ["country", "year", "prop"],
+    ]
+    assert tb.groupby(["country", "year"]).size().max() == 1, "Only 1 rows per country-year accepted"
+
+    # Estimate "no formal education"
+    tb = tb.rename(columns={"prop": "no_basic_education"})
+
+    # Estimate "with basic education"
+    tb["basic_education"] = 100 - tb["no_basic_education"]
+
+    return tb
diff --git a/etl/steps/data/grapher/education/2024-12-11/people_with_education.py b/etl/steps/data/grapher/education/2024-12-11/people_with_education.py
@@ -0,0 +1,32 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("people_with_education")
+
+    # Read table from garden dataset.
+    tables = list(ds_garden)
+
+    #
+    # Process data.
+    #
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()