From 911d6f23f40527f6a30cf0f40b39cf85be85a9e6 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Sat, 23 Nov 2024 11:13:00 +0100 Subject: [PATCH 1/4] education: EdStats old version check --- dag/education.yml | 14 +++++++++++++- .../education/2023-12-15/wittgenstein_center.py | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/dag/education.yml b/dag/education.yml index 04fd9587abe..ec16cf7b226 100644 --- a/dag/education.yml +++ b/dag/education.yml @@ -1,4 +1,16 @@ steps: + # World Bank EdStats + data://meadow/wb/2023-07-10/education: + - snapshot://wb/2023-07-10/education.csv + + data://garden/wb/2023-07-10/education: + - data://meadow/wb/2023-07-10/education + - data://garden/education/2017-09-30/public_expenditure + - data://garden/education/2018-04-18/literacy_rates + + data://grapher/wb/2023-07-10/education: + - data://garden/wb/2023-07-10/education + - # Barro and Lee projections dataset data://meadow/education/2023-07-17/education_barro_lee_projections: - snapshot://education/2023-07-17/education_barro_lee_projections.csv @@ -41,7 +53,7 @@ steps: - snapshot://education/2023-08-14/oecd_education.csv data://garden/education/2023-08-14/oecd_education: - data://meadow/education/2023-08-14/oecd_education - - data://garden/wb/2024-11-04/edstats + - data://garden/wb/2023-07-10/education data://grapher/education/2023-08-14/oecd_education: - data://garden/education/2023-08-14/oecd_education diff --git a/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py b/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py index e911deeed50..443b29a4ca3 100644 --- a/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py +++ b/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py @@ -23,6 +23,7 @@ def run(dest_dir: str) -> None: ds_oecd = paths.load_dataset("oecd_education") tb_oecd = ds_oecd["oecd_education"].reset_index() tb_oecd_formal_ed = tb_oecd[["country", "year", "no_formal_education", "population_with_basic_education"]] + print(tb_oecd_formal_ed) # Filter the for years above 2020 (New Wittgenstein Center data starts at 2020) tb_below_2020 = tb_oecd_formal_ed[tb_oecd_formal_ed["year"] < 2020].reset_index(drop=True) From 3ff2a68192256e0cf1cf99ac6564a37faaed8ca0 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Sat, 23 Nov 2024 11:20:34 +0100 Subject: [PATCH 2/4] bug --- dag/education.yml | 4 +--- etl/steps/data/garden/education/2023-08-14/oecd_education.py | 4 ++-- .../data/garden/education/2023-12-15/wittgenstein_center.py | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/dag/education.yml b/dag/education.yml index ec16cf7b226..3cd6ef48d22 100644 --- a/dag/education.yml +++ b/dag/education.yml @@ -2,15 +2,13 @@ steps: # World Bank EdStats data://meadow/wb/2023-07-10/education: - snapshot://wb/2023-07-10/education.csv - data://garden/wb/2023-07-10/education: - data://meadow/wb/2023-07-10/education - data://garden/education/2017-09-30/public_expenditure - data://garden/education/2018-04-18/literacy_rates - data://grapher/wb/2023-07-10/education: - data://garden/wb/2023-07-10/education - - + # Barro and Lee projections dataset data://meadow/education/2023-07-17/education_barro_lee_projections: - snapshot://education/2023-07-17/education_barro_lee_projections.csv diff --git a/etl/steps/data/garden/education/2023-08-14/oecd_education.py b/etl/steps/data/garden/education/2023-08-14/oecd_education.py index 0f55dd9cea9..62aa1f91c70 100644 --- a/etl/steps/data/garden/education/2023-08-14/oecd_education.py +++ b/etl/steps/data/garden/education/2023-08-14/oecd_education.py @@ -23,8 +23,8 @@ def run(dest_dir: str) -> None: tb = ds_meadow["oecd_education"].reset_index() # Load the World Bank Education Dataset - ds_garden_wb = paths.load_dataset("edstats") - tb_wb = ds_garden_wb["edstats"].reset_index() + ds_garden_wb = paths.load_dataset("education") + tb_wb = ds_garden_wb["education"].reset_index() # Harmonize country names tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py b/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py index 443b29a4ca3..e911deeed50 100644 --- a/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py +++ b/etl/steps/data/garden/education/2023-12-15/wittgenstein_center.py @@ -23,7 +23,6 @@ def run(dest_dir: str) -> None: ds_oecd = paths.load_dataset("oecd_education") tb_oecd = ds_oecd["oecd_education"].reset_index() tb_oecd_formal_ed = tb_oecd[["country", "year", "no_formal_education", "population_with_basic_education"]] - print(tb_oecd_formal_ed) # Filter the for years above 2020 (New Wittgenstein Center data starts at 2020) tb_below_2020 = tb_oecd_formal_ed[tb_oecd_formal_ed["year"] < 2020].reset_index(drop=True) From a81a1f6a16cfe20e1712bdecec00b939b0372d23 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Sat, 23 Nov 2024 22:23:38 +0100 Subject: [PATCH 3/4] bug in no formal education data (from provider) --- dag/education.yml | 10 ---------- etl/steps/data/garden/wb/2024-11-04/edstats.py | 4 ++++ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/dag/education.yml b/dag/education.yml index 3cd6ef48d22..6c0be1a973c 100644 --- a/dag/education.yml +++ b/dag/education.yml @@ -1,14 +1,4 @@ steps: - # World Bank EdStats - data://meadow/wb/2023-07-10/education: - - snapshot://wb/2023-07-10/education.csv - data://garden/wb/2023-07-10/education: - - data://meadow/wb/2023-07-10/education - - data://garden/education/2017-09-30/public_expenditure - - data://garden/education/2018-04-18/literacy_rates - data://grapher/wb/2023-07-10/education: - - data://garden/wb/2023-07-10/education - # Barro and Lee projections dataset data://meadow/education/2023-07-17/education_barro_lee_projections: - snapshot://education/2023-07-17/education_barro_lee_projections.csv diff --git a/etl/steps/data/garden/wb/2024-11-04/edstats.py b/etl/steps/data/garden/wb/2024-11-04/edstats.py index bd365d12df7..c5d89aa8b88 100644 --- a/etl/steps/data/garden/wb/2024-11-04/edstats.py +++ b/etl/steps/data/garden/wb/2024-11-04/edstats.py @@ -57,6 +57,10 @@ def run(dest_dir: str) -> None: # Rename columns based on metadata tb = rename_columns(tb, metadata_tb) + # Convert the share of the population with no education to a percentage (bug in the data) + tb[ + "wittgenstein_projection__percentage_of_the_population_age_15plus_by_highest_level_of_educational_attainment__no_education__total" + ] *= 100 tb = tb.format(["country", "year"]) # From e37cd37a0fa467767137445c67fbcb2f6699ac10 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Sat, 23 Nov 2024 22:25:10 +0100 Subject: [PATCH 4/4] fix dependencies --- dag/education.yml | 2 +- etl/steps/data/garden/education/2023-08-14/oecd_education.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dag/education.yml b/dag/education.yml index 6c0be1a973c..04fd9587abe 100644 --- a/dag/education.yml +++ b/dag/education.yml @@ -41,7 +41,7 @@ steps: - snapshot://education/2023-08-14/oecd_education.csv data://garden/education/2023-08-14/oecd_education: - data://meadow/education/2023-08-14/oecd_education - - data://garden/wb/2023-07-10/education + - data://garden/wb/2024-11-04/edstats data://grapher/education/2023-08-14/oecd_education: - data://garden/education/2023-08-14/oecd_education diff --git a/etl/steps/data/garden/education/2023-08-14/oecd_education.py b/etl/steps/data/garden/education/2023-08-14/oecd_education.py index 62aa1f91c70..0f55dd9cea9 100644 --- a/etl/steps/data/garden/education/2023-08-14/oecd_education.py +++ b/etl/steps/data/garden/education/2023-08-14/oecd_education.py @@ -23,8 +23,8 @@ def run(dest_dir: str) -> None: tb = ds_meadow["oecd_education"].reset_index() # Load the World Bank Education Dataset - ds_garden_wb = paths.load_dataset("education") - tb_wb = ds_garden_wb["education"].reset_index() + ds_garden_wb = paths.load_dataset("edstats") + tb_wb = ds_garden_wb["edstats"].reset_index() # Harmonize country names tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)