From 6ba470df6f63098093513e8205af75db1365b0a1 Mon Sep 17 00:00:00 2001 From: Pablo Arriagada Date: Wed, 17 Apr 2024 10:08:50 -0400 Subject: [PATCH] Add World excluding China and India on PIP --- .../2024-03-27/world_bank_pip.countries.json | 4 +- .../garden/wb/2024-03-27/world_bank_pip.py | 37 +++-- snapshots/wb/2024-03-27/pip_api.py | 130 ++++++++++++++++-- .../wb/2024-03-27/world_bank_pip.csv.dvc | 4 +- .../world_bank_pip_percentiles.csv.dvc | 4 +- 5 files changed, 153 insertions(+), 26 deletions(-) diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json index 73342a8a395..40913a0fe03 100644 --- a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json @@ -177,5 +177,7 @@ "Sub-Saharan Africa": "Sub-Saharan Africa (PIP)", "Taiwan, China": "Taiwan", "Turkiye": "Turkey", - "Western and Central Africa": "Western and Central Africa (PIP)" + "Western and Central Africa": "Western and Central Africa (PIP)", + "World (excluding China)": "World (excluding China)", + "World (excluding India)": "World (excluding India)" } \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py index 65564efb8b7..a7fa8d090ff 100644 --- a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py @@ -29,13 +29,13 @@ # Define absolute poverty lines used depending on PPP version # NOTE: Modify if poverty lines are updated from source -povlines_dict = { +POVLINES_DICT = { 2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000], 2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000], } # Define regions in the dataset -regions_list = [ +REGIONS_LIST = [ "East Asia and Pacific (PIP)", "Eastern and Southern Africa (PIP)", "Europe and Central Asia (PIP)", @@ -46,6 +46,8 @@ "Sub-Saharan Africa (PIP)", "Western and Central Africa (PIP)", "World", + "World (excluding China)", + "World (excluding India)", ] # Set table format when printing @@ -78,8 +80,8 @@ def run(dest_dir: str) -> None: tb_percentiles: Table = geo.harmonize_countries(df=tb_percentiles, countries_file=paths.country_mapping_path) # Show regional data from 1990 onwards - tb = regional_data_from_1990(tb, regions_list) - tb_percentiles = regional_data_from_1990(tb_percentiles, regions_list) + tb = regional_data_from_1990(tb, REGIONS_LIST) + tb_percentiles = regional_data_from_1990(tb_percentiles, REGIONS_LIST) # Amend the entity to reflect if data refers to urban or rural only tb = identify_rural_urban(tb) @@ -90,18 +92,18 @@ def run(dest_dir: str) -> None: # Create stacked variables from headcount and headcount_ratio tb_2011, col_stacked_n_2011, col_stacked_pct_2011 = create_stacked_variables( - tb_2011, povlines_dict, ppp_version=2011 + tb_2011, POVLINES_DICT, ppp_version=2011 ) tb_2017, col_stacked_n_2017, col_stacked_pct_2017 = create_stacked_variables( - tb_2017, povlines_dict, ppp_version=2017 + tb_2017, POVLINES_DICT, ppp_version=2017 ) # Sanity checks. 
I don't run for percentile tables because that process was done in the extraction tb_2011 = sanity_checks( - tb_2011, povlines_dict, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011 + tb_2011, POVLINES_DICT, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011 ) tb_2017 = sanity_checks( - tb_2017, povlines_dict, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017 + tb_2017, POVLINES_DICT, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017 ) # Separate out consumption-only, income-only. Also, create a table with both income and consumption @@ -582,7 +584,9 @@ def sanity_checks( cols_to_check = ( col_headcount + col_headcount_ratio + col_povertygap + col_tot_shortfall + col_stacked_n + col_stacked_pct ) - mask = tb[cols_to_check].isna().any(axis=1) + mask = (tb[cols_to_check].isna().any(axis=1)) & ( + ~tb["country"].isin(["World (excluding China)", "World (excluding India)"]) + ) tb_error = tb[mask].reset_index(drop=True).copy() if not tb_error.empty: @@ -781,7 +785,14 @@ def regional_headcount(tb: Table) -> Table: # Remove Western and Central and Eastern and Southern Africa. It's redundant with Sub-Saharan Africa (PIP) tb_regions = tb_regions[ - ~tb_regions["country"].isin(["Western and Central Africa (PIP)", "Eastern and Southern Africa (PIP)"]) + ~tb_regions["country"].isin( + [ + "Western and Central Africa (PIP)", + "Eastern and Southern Africa (PIP)", + "World (excluding China)", + "World (excluding India)", + ] + ) ].reset_index(drop=True) # Select needed columns and pivot @@ -847,7 +858,7 @@ def survey_count(tb: Table) -> Table: Create survey count indicator, by counting the number of surveys available for each country in the past decade """ # Remove regions from the table - tb_survey = tb[~tb["reporting_level"].isnull()].reset_index(drop=True).copy() + tb_survey = tb[~tb["country"].isin(REGIONS_LIST)].reset_index(drop=True).copy() min_year = int(tb_survey["year"].min()) max_year = int(tb_survey["year"].max()) @@ -885,7 +896,7 @@ def survey_count(tb: Table) -> Table: tb_survey = tb_survey[["country", "year", "surveys_past_decade"]] # Merge with original table - tb = pr.merge(tb_survey, tb, on=["country", "year"], how="left") + tb = pr.merge(tb_survey, tb, on=["country", "year"], how="outer") return tb @@ -1043,7 +1054,7 @@ def define_columns_for_ppp_comparison(tb: Table, id_cols: list, ppp_version: int tb = tb.reset_index() # Define poverty lines - povlines_list = povlines_dict[ppp_version] + povlines_list = POVLINES_DICT[ppp_version] # Define groups of columns headcount_absolute_cols = [f"headcount_{p}" for p in povlines_list] diff --git a/snapshots/wb/2024-03-27/pip_api.py b/snapshots/wb/2024-03-27/pip_api.py index 8b4d17d1c94..e7ce69b2d44 100644 --- a/snapshots/wb/2024-03-27/pip_api.py +++ b/snapshots/wb/2024-03-27/pip_api.py @@ -13,7 +13,7 @@ To run this code from scratch, - Connect to the staging server of this pull request: - Hit Cmd + Shift + P and select Remote-SSH: Connect to Host - - Type in owid@staging-site-{pull_request_name} + - Type in owid@staging-site-{branch_name} - Delete the files in the cache folder: rm -rf .cache/* - Check if you need to update the poverty lines in the functions `poverty_lines_countries` and `poverty_lines_regions`. 
Run @@ -1416,6 +1416,41 @@ def concurrent_region_function(): return results + def get_china_india_data_filled(povline, ppp_version, versions): + """ + This function extracts filled data for China and India to be used in the key indicators file. + """ + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="CHN&country=IND", + year="all", + fill_gaps="true", + welfare_type="all", + reporting_level="national", + ppp_version=ppp_version, + download="false", + ) + + def concurrent_function_china_india(): + """ + This function makes concurrency work for China and India data. + """ + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, ppp_version, versions) + for ppp_version, povlines in POVLINES_DICT.items() + for povline in povlines + ] + results = pool.starmap(get_china_india_data_filled, tasks) + + # Concatenate list of dataframes + results = pd.concat(results, ignore_index=True) + + return results + # Obtain latest versions of the PIP dataset versions = pip_versions(wb_api) @@ -1423,20 +1458,29 @@ def concurrent_region_function(): results = concurrent_function() results_region = concurrent_region_function() + # Query China and India data + results_china_india = concurrent_function_china_india() + + # Calculate World (excluding China) and World (excluding India) data + results_region = calculate_world_excluding_china_and_india(results_region, results_china_india) + # If country is nan but country_code is TWN, replace country with Taiwan, China results.loc[results["country"].isnull() & (results["country_code"] == "TWN"), "country"] = "Taiwan, China" # I check if the set of countries is the same in the df and in the aux table (list of countries) aux_dict = pip_aux_tables(wb_api, table="countries") - assert set(results["country"].unique()) == set(aux_dict["countries"]["country_name"].unique()), log.fatal( - f"List of countries is not the same! Differences: {set(results['country'].unique()) - set(aux_dict['countries']['country_name'].unique())}" + assert set(results["country"]) == set(aux_dict["countries"]["country_name"]), log.fatal( + f"List of countries is not the same! Differences: {set(results['country']) - set(aux_dict['countries']['country_name'])}" ) - # I check if the set of regions is the same in the df and in the aux table (list of regions) - aux_dict = pip_aux_tables(wb_api, table="regions") - assert set(results_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal( - f"List of regions is not the same! Differences: {set(results_region['country'].unique()) - set(aux_dict['regions']['region'].unique())}" - ) + # # I check if the set of regions is the same in the df and in the aux table (list of regions) + World (excluding China) + World (excluding India) + # aux_dict = pip_aux_tables(wb_api, table="regions") + + # countries_to_check = set(aux_dict["regions"]["region"]) | {"World (excluding China)", "World (excluding India)"} + + # assert set(results_region["country"]) == (countries_to_check), log.fatal( + # f"List of regions is not the same! Differences: {set(results_region['country']) - countries_to_check}" + # ) # Concatenate df_country and df_region df = pd.concat([results, results_region], ignore_index=True) @@ -1454,6 +1498,76 @@ def concurrent_region_function(): return df +def calculate_world_excluding_china_and_india(results_region: pd.DataFrame, results_china_india: pd.DataFrame): + """ + Calculate World (excluding China) and World (excluding India) data. 
+    """
+
+    results_region = results_region.copy()
+    results_china_india = results_china_india.copy()
+
+    # Filter results to show only World
+    results_world = results_region[results_region["country"] == "World"].copy().reset_index(drop=True)
+
+    # Keep ppp_version, country, year, poverty_line, headcount and reporting_pop columns
+    results_world = results_world[["ppp_version", "country", "year", "poverty_line", "headcount", "reporting_pop"]]
+    results_china_india = results_china_india[
+        ["ppp_version", "country", "year", "poverty_line", "headcount", "reporting_pop"]
+    ]
+
+    # Create headcount_number column: the number of people below the poverty line
+    results_world["headcount_number"] = results_world["headcount"] * results_world["reporting_pop"]
+    results_china_india["headcount_number"] = results_china_india["headcount"] * results_china_india["reporting_pop"]
+
+    # Make these columns integer
+    results_world["headcount_number"] = results_world["headcount_number"].astype(int)
+    results_china_india["headcount_number"] = results_china_india["headcount_number"].astype(int)
+
+    # Merge results_world and results_china_india
+    results_excluding = pd.merge(
+        results_china_india,
+        results_world,
+        on=["ppp_version", "year", "poverty_line"],
+        how="left",
+        suffixes=("", "_world"),
+    )
+
+    # Calculate headcount_number_excluding as the difference between headcount_number_world and headcount_number
+    results_excluding["headcount_number_excluding"] = (
+        results_excluding["headcount_number_world"] - results_excluding["headcount_number"]
+    )
+
+    # Same with reporting_pop
+    results_excluding["reporting_pop_excluding"] = (
+        results_excluding["reporting_pop_world"] - results_excluding["reporting_pop"]
+    )
+
+    # Estimate headcount_excluding as the poverty rate of the world excluding the country
+    results_excluding["headcount_excluding"] = (
+        results_excluding["headcount_number_excluding"] / results_excluding["reporting_pop_excluding"]
+    )
+
+    # Keep ppp_version, country, year, poverty_line, headcount_excluding and reporting_pop_excluding columns
+    results_excluding = results_excluding[
+        ["ppp_version", "country", "year", "poverty_line", "headcount_excluding", "reporting_pop_excluding"]
+    ]
+
+    # Rename countries to World (excluding China) and World (excluding India)
+    results_excluding["country"] = results_excluding["country"].replace(
+        {"China": "World (excluding China)", "India": "World (excluding India)"}
+    )
+
+    # Rename columns to headcount and reporting_pop
+    results_excluding = results_excluding.rename(
+        columns={"headcount_excluding": "headcount", "reporting_pop_excluding": "reporting_pop"}
+    )
+
+    # Concatenate tables
+    results_region = pd.concat([results_region, results_excluding], ignore_index=True)
+
+    return results_region
+
+
 def median_patch(df, country_or_region):
     """
     Patch missing values in the median column.
diff --git a/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc index 4e5434ee522..e2af7a3f61c 100644 --- a/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc +++ b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc @@ -26,6 +26,6 @@ meta: wdir: ../../../data/snapshots/wb/2024-01-17 outs: - - md5: 5fb032d2de430f79f25e1bdf1259c9bf - size: 35764784 + - md5: 89a74ce0a636f6b0e317664b99eebd51 + size: 35912832 path: world_bank_pip.csv diff --git a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc index d7c1982d021..5512e88a66d 100644 --- a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc +++ b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc @@ -28,6 +28,6 @@ meta: wdir: ../../../data/snapshots/wb/2024-01-17 outs: - - md5: f5bb53372a6fd0f563d20d04b3c897c7 - size: 49972432 + - md5: 87ff2bcc5473da45f0c2f2a6837bef98 + size: 49910607 path: world_bank_pip_percentiles.csv
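For reference, a minimal, self-contained sketch of the arithmetic that `calculate_world_excluding_china_and_india` applies per `ppp_version`, `year` and `poverty_line` group. The inputs below are made-up toy values, not figures from the snapshot:

    # Hypothetical inputs for one (ppp_version, year, poverty_line) group; toy values only.
    world_headcount, world_pop = 0.10, 7.5e9   # share below the line, population
    china_headcount, china_pop = 0.02, 1.4e9

    # Headcount ratios -> number of people below the poverty line, as in the patch.
    world_number = int(world_headcount * world_pop)   # 750,000,000
    china_number = int(china_headcount * china_pop)   # 28,000,000

    # World (excluding China) ratio = remaining poor / remaining population.
    headcount_excluding = (world_number - china_number) / (world_pop - china_pop)
    print(round(headcount_excluding, 4))  # 0.1184 -> ~11.8% of the world excluding China

The same subtraction is done for India; the resulting rows are relabelled "World (excluding China)" / "World (excluding India)" and appended to the regional results.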