diff --git a/etl/steps/data/garden/oecd/2024-08-21/official_development_assistance.py b/etl/steps/data/garden/oecd/2024-08-21/official_development_assistance.py index 4fd0da1e0fb..416a1eca08f 100644 --- a/etl/steps/data/garden/oecd/2024-08-21/official_development_assistance.py +++ b/etl/steps/data/garden/oecd/2024-08-21/official_development_assistance.py @@ -479,38 +479,6 @@ def add_aid_by_channel(tb: Table, tb_channels_donor: Table, tb_channels_recipien return tb -def remove_jumps_in_the_data_and_unneeded_cols(tb: Table) -> Table: - """ - Remove jumps in the data generated by own calculation. - This is most likely because of aggregations of population and GNI not properly done by the source. - This is a temporary solution until the source fixes the data. It is already reported. - - Also, remove redundant columns. - """ - - # For i_oda_net_disbursements_share_gni - tb.loc[ - (tb["country"] == "Non-DAC countries (OECD)"), - "i_oda_net_disbursements_share_gni", - ] = None - - # For i_oda_net_disbursements_per_capita - tb.loc[ - (tb["country"] == "Non-DAC countries (OECD)") & (tb["year"] == 2007), "i_oda_net_disbursements_per_capita" - ] = None - - # Remove rows where country = Non-DAC countries (OECD) and year 2023 - # This is because the data is not complete until December 2024 - tb = tb[~((tb["country"] == "Non-DAC countries (OECD)") & (tb["year"] == 2023))].reset_index(drop=True) - - # Remove columns - tb = tb.drop( - columns=["oda_bilateral_2_grant_equivalents", "oda_multilateral_2_grant_equivalents", "i_oda_grant_equivalents"] - ) - - return tb - - def limit_grant_equivalents_from_2018_only(tb: Table) -> Table: """ Limit grant equivalent indicators from year 2018 onwards. @@ -541,3 +509,35 @@ def combine_net_and_grant_equivalents(tb: Table) -> Table: tb.loc[tb["year"] < 2018, "oda_official_estimate_share_gni"] = tb["i_oda_net_disbursements_share_gni"] return tb + + +def remove_jumps_in_the_data_and_unneeded_cols(tb: Table) -> Table: + """ + Remove jumps in the data generated by own calculation. + This is most likely because of aggregations of population and GNI not properly done by the source. + This is a temporary solution until the source fixes the data. It is already reported. + + Also, remove redundant columns. + """ + + # For i_oda_net_disbursements_share_gni + tb.loc[ + (tb["country"] == "Non-DAC countries (OECD)"), + "i_oda_net_disbursements_share_gni", + ] = None + + # For i_oda_net_disbursements_per_capita + tb.loc[ + (tb["country"] == "Non-DAC countries (OECD)") & (tb["year"] == 2007), "i_oda_net_disbursements_per_capita" + ] = None + + # Remove rows where country = Non-DAC countries (OECD) and year 2023 + # This is because the data is not complete until December 2024 + tb = tb[~((tb["country"] == "Non-DAC countries (OECD)") & (tb["year"] == 2023))].reset_index(drop=True) + + # Remove columns + tb = tb.drop( + columns=["oda_bilateral_2_grant_equivalents", "oda_multilateral_2_grant_equivalents", "i_oda_grant_equivalents"] + ) + + return tb diff --git a/etl/steps/data/garden/one/2024-10-02/official_development_assistance_one.py b/etl/steps/data/garden/one/2024-10-02/official_development_assistance_one.py index e6ad42b4d53..1e7bca8f525 100644 --- a/etl/steps/data/garden/one/2024-10-02/official_development_assistance_one.py +++ b/etl/steps/data/garden/one/2024-10-02/official_development_assistance_one.py @@ -1,3 +1,4 @@ +# NOTE: After December 2024 update, check the steps in `remove_data_for_most_recent_year` """Load a meadow dataset and create a garden dataset.""" import owid.catalog.processing as pr @@ -13,6 +14,9 @@ INDEX_SECTORS = ["donor_name", "recipient_name", "year", "sector_name"] INDEX_CHANNELS = ["donor_name", "recipient_name", "year", "channel_name"] +# Define most recent year in the data +MOST_RECENT_YEAR = 2023 + # Define mapping for sectors, including new names, sub-sectors, and sectors. SECTORS_MAPPING = { "I.1.a. Education, Level Unspecified": { @@ -254,7 +258,7 @@ "4": "Multilateral organizations", "5": "University, college or other teaching institution, research institute or think-tank", "6": "Private sector institutions", - "9": "Other", + "9": "Unspecified", } # Define multiplier for values @@ -275,6 +279,10 @@ def run(dest_dir: str) -> None: # # Process data. # + # Remove data for the most recent year. + tb_sectors = remove_data_for_most_recent_year(tb=tb_sectors, year=MOST_RECENT_YEAR) + tb_channels = remove_data_for_most_recent_year(tb=tb_channels, year=MOST_RECENT_YEAR) + tb_sectors = geo.harmonize_countries( df=tb_sectors, country_col="donor_name", @@ -486,3 +494,14 @@ def rename_and_aggregate_channels(tb: Table) -> Table: tb = tb.groupby(INDEX_CHANNELS, observed=True, dropna=False)["value"].sum().reset_index() return tb + + +def remove_data_for_most_recent_year(tb: Table, year: int) -> Table: + """ + Remove data for the most recent year. + """ + + # Filter the table to remove the most recent year + tb = tb[tb["year"] != year].reset_index(drop=True) + + return tb