Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ add World excluding China and India data for World Bank PIP #2528

Merged
merged 1 commit into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -177,5 +177,7 @@
"Sub-Saharan Africa": "Sub-Saharan Africa (PIP)",
"Taiwan, China": "Taiwan",
"Turkiye": "Turkey",
"Western and Central Africa": "Western and Central Africa (PIP)"
"Western and Central Africa": "Western and Central Africa (PIP)",
"World (excluding China)": "World (excluding China)",
"World (excluding India)": "World (excluding India)"
}
37 changes: 24 additions & 13 deletions etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@

# Define absolute poverty lines used depending on PPP version
# NOTE: Modify if poverty lines are updated from source
povlines_dict = {
POVLINES_DICT = {
2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000],
2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000],
}

# Define regions in the dataset
regions_list = [
REGIONS_LIST = [
"East Asia and Pacific (PIP)",
"Eastern and Southern Africa (PIP)",
"Europe and Central Asia (PIP)",
Expand All @@ -46,6 +46,8 @@
"Sub-Saharan Africa (PIP)",
"Western and Central Africa (PIP)",
"World",
"World (excluding China)",
"World (excluding India)",
]

# Set table format when printing
Expand Down Expand Up @@ -78,8 +80,8 @@ def run(dest_dir: str) -> None:
tb_percentiles: Table = geo.harmonize_countries(df=tb_percentiles, countries_file=paths.country_mapping_path)

# Show regional data from 1990 onwards
tb = regional_data_from_1990(tb, regions_list)
tb_percentiles = regional_data_from_1990(tb_percentiles, regions_list)
tb = regional_data_from_1990(tb, REGIONS_LIST)
tb_percentiles = regional_data_from_1990(tb_percentiles, REGIONS_LIST)

# Amend the entity to reflect if data refers to urban or rural only
tb = identify_rural_urban(tb)
Expand All @@ -90,18 +92,18 @@ def run(dest_dir: str) -> None:

# Create stacked variables from headcount and headcount_ratio
tb_2011, col_stacked_n_2011, col_stacked_pct_2011 = create_stacked_variables(
tb_2011, povlines_dict, ppp_version=2011
tb_2011, POVLINES_DICT, ppp_version=2011
)
tb_2017, col_stacked_n_2017, col_stacked_pct_2017 = create_stacked_variables(
tb_2017, povlines_dict, ppp_version=2017
tb_2017, POVLINES_DICT, ppp_version=2017
)

# Sanity checks. I don't run for percentile tables because that process was done in the extraction
tb_2011 = sanity_checks(
tb_2011, povlines_dict, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011
tb_2011, POVLINES_DICT, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011
)
tb_2017 = sanity_checks(
tb_2017, povlines_dict, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017
tb_2017, POVLINES_DICT, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017
)

# Separate out consumption-only, income-only. Also, create a table with both income and consumption
Expand Down Expand Up @@ -582,7 +584,9 @@ def sanity_checks(
cols_to_check = (
col_headcount + col_headcount_ratio + col_povertygap + col_tot_shortfall + col_stacked_n + col_stacked_pct
)
mask = tb[cols_to_check].isna().any(axis=1)
mask = (tb[cols_to_check].isna().any(axis=1)) & (
~tb["country"].isin(["World (excluding China)", "World (excluding India)"])
)
tb_error = tb[mask].reset_index(drop=True).copy()

if not tb_error.empty:
Expand Down Expand Up @@ -781,7 +785,14 @@ def regional_headcount(tb: Table) -> Table:

# Remove Western and Central Africa and Eastern and Southern Africa (redundant with Sub-Saharan Africa (PIP)), as well as the World (excluding China) and World (excluding India) aggregates
tb_regions = tb_regions[
~tb_regions["country"].isin(["Western and Central Africa (PIP)", "Eastern and Southern Africa (PIP)"])
~tb_regions["country"].isin(
[
"Western and Central Africa (PIP)",
"Eastern and Southern Africa (PIP)",
"World (excluding China)",
"World (excluding India)",
]
)
].reset_index(drop=True)

# Select needed columns and pivot
Expand Down Expand Up @@ -847,7 +858,7 @@ def survey_count(tb: Table) -> Table:
Create survey count indicator, by counting the number of surveys available for each country in the past decade
"""
# Remove regions from the table
tb_survey = tb[~tb["reporting_level"].isnull()].reset_index(drop=True).copy()
tb_survey = tb[~tb["country"].isin(REGIONS_LIST)].reset_index(drop=True).copy()

min_year = int(tb_survey["year"].min())
max_year = int(tb_survey["year"].max())
Expand Down Expand Up @@ -885,7 +896,7 @@ def survey_count(tb: Table) -> Table:
tb_survey = tb_survey[["country", "year", "surveys_past_decade"]]

# Merge with original table
tb = pr.merge(tb_survey, tb, on=["country", "year"], how="left")
tb = pr.merge(tb_survey, tb, on=["country", "year"], how="outer")

return tb

Expand Down Expand Up @@ -1043,7 +1054,7 @@ def define_columns_for_ppp_comparison(tb: Table, id_cols: list, ppp_version: int

tb = tb.reset_index()
# Define poverty lines
povlines_list = povlines_dict[ppp_version]
povlines_list = POVLINES_DICT[ppp_version]

# Define groups of columns
headcount_absolute_cols = [f"headcount_{p}" for p in povlines_list]
Expand Down
130 changes: 122 additions & 8 deletions snapshots/wb/2024-03-27/pip_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
To run this code from scratch,
- Connect to the staging server of this pull request:
- Hit Cmd + Shift + P and select Remote-SSH: Connect to Host
- Type in owid@staging-site-{pull_request_name}
- Type in owid@staging-site-{branch_name}
- Delete the files in the cache folder:
rm -rf .cache/*
- Check if you need to update the poverty lines in the functions `poverty_lines_countries` and `poverty_lines_regions`. Run
Expand Down Expand Up @@ -1416,27 +1416,71 @@ def concurrent_region_function():

return results

def get_china_india_data_filled(povline, ppp_version, versions):
    """
    Query the gap-filled, national-level data for China and India at one poverty line.

    The result is used to build the key indicators file. `wb_api` and `pip_query_country`
    are not defined here; they are resolved from the surrounding scope.
    """
    query_options = dict(
        popshare_or_povline="povline",
        # The poverty line is divided by 100 before querying
        # (presumably it is stored in cents and queried in dollars - TODO confirm)
        value=povline / 100,
        versions=versions,
        # NOTE(review): looks like this appends a second `country` parameter to the query
        # string so both countries come back in one request - confirm against pip_query_country
        country_code="CHN&country=IND",
        year="all",
        fill_gaps="true",
        welfare_type="all",
        reporting_level="national",
        ppp_version=ppp_version,
        download="false",
    )

    return pip_query_country(wb_api, **query_options)

def concurrent_function_china_india():
    """
    Run the China and India queries concurrently and return them as a single dataframe.

    One query is issued per poverty line of each PPP version in POVLINES_DICT.
    `versions` is not defined here; it is looked up from the surrounding scope at call time.
    """
    # Build one (povline, ppp_version, versions) task per poverty line and PPP version
    tasks = []
    for ppp_version, povlines in POVLINES_DICT.items():
        for povline in povlines:
            tasks.append((povline, ppp_version, versions))

    # Distribute the queries over a pool of threads
    with ThreadPool(MAX_WORKERS) as pool:
        frames = pool.starmap(get_china_india_data_filled, tasks)

    # Stack the per-query dataframes into one
    return pd.concat(frames, ignore_index=True)

# Obtain latest versions of the PIP dataset
versions = pip_versions(wb_api)

# Run the main function
results = concurrent_function()
results_region = concurrent_region_function()

# Query China and India data
results_china_india = concurrent_function_china_india()

# Calculate World (excluding China) and World (excluding India) data
results_region = calculate_world_excluding_china_and_india(results_region, results_china_india)

# If country is nan but country_code is TWN, replace country with Taiwan, China
results.loc[results["country"].isnull() & (results["country_code"] == "TWN"), "country"] = "Taiwan, China"

# I check if the set of countries is the same in the df and in the aux table (list of countries)
aux_dict = pip_aux_tables(wb_api, table="countries")
assert set(results["country"].unique()) == set(aux_dict["countries"]["country_name"].unique()), log.fatal(
f"List of countries is not the same! Differences: {set(results['country'].unique()) - set(aux_dict['countries']['country_name'].unique())}"
assert set(results["country"]) == set(aux_dict["countries"]["country_name"]), log.fatal(
f"List of countries is not the same! Differences: {set(results['country']) - set(aux_dict['countries']['country_name'])}"
)

# I check if the set of regions is the same in the df and in the aux table (list of regions)
aux_dict = pip_aux_tables(wb_api, table="regions")
assert set(results_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal(
f"List of regions is not the same! Differences: {set(results_region['country'].unique()) - set(aux_dict['regions']['region'].unique())}"
)
# # I check if the set of regions is the same in the df and in the aux table (list of regions) + World (excluding China) + World (excluding India)
# aux_dict = pip_aux_tables(wb_api, table="regions")

# countries_to_check = set(aux_dict["regions"]["region"]) | {"World (excluding China)", "World (excluding India)"}

# assert set(results_region["country"]) == (countries_to_check), log.fatal(
# f"List of regions is not the same! Differences: {set(results_region['country']) - countries_to_check}"
# )

# Concatenate df_country and df_region
df = pd.concat([results, results_region], ignore_index=True)
Expand All @@ -1454,6 +1498,76 @@ def concurrent_region_function():
return df


def calculate_world_excluding_china_and_india(
    results_region: pd.DataFrame, results_china_india: pd.DataFrame
) -> pd.DataFrame:
    """
    Calculate World (excluding China) and World (excluding India) data.

    For each (ppp_version, year, poverty_line) available for China and India, the number of
    people below the line and the population of the country are subtracted from the World
    figures, and the headcount share is recomputed from those two differences.

    Args:
        results_region: Regional results. Must contain the columns ppp_version, country, year,
            poverty_line, headcount and reporting_pop, with the global aggregate labelled "World".
            headcount is treated as a share of reporting_pop.
        results_china_india: Country results with the same columns, where country is
            "China" or "India".

    Returns:
        results_region with the new "World (excluding China)" and "World (excluding India)" rows
        appended. Only ppp_version, country, year, poverty_line, headcount and reporting_pop are
        filled for the new rows; every other column is missing for them. The inputs are not
        modified.
    """
    key_cols = ["ppp_version", "year", "poverty_line"]
    keep_cols = ["ppp_version", "country", "year", "poverty_line", "headcount", "reporting_pop"]

    # Keep only the World rows and the columns needed. Explicit copies, so that the inputs are
    # never modified and adding columns below does not trigger chained-assignment warnings.
    results_world = (
        results_region.loc[results_region["country"] == "World", keep_cols].reset_index(drop=True).copy()
    )
    results_countries = results_china_india[keep_cols].copy()

    # Create the headcount_number column (number of people below the poverty line).
    # It is rounded to whole people instead of being cast with astype(int): the cast truncates
    # (0.29 * 100 = 28.999... would become 28) and fails when there are missing values.
    for frame in (results_world, results_countries):
        frame["headcount_number"] = (frame["headcount"] * frame["reporting_pop"]).round()

    # Attach the World figures to each China/India row. World columns get the "_world" suffix.
    results_excluding = pd.merge(
        results_countries,
        results_world,
        on=key_cols,
        how="left",
        suffixes=("", "_world"),
    )

    # Number of people below the line and population, excluding the country
    headcount_number_excluding = results_excluding["headcount_number_world"] - results_excluding["headcount_number"]
    reporting_pop_excluding = results_excluding["reporting_pop_world"] - results_excluding["reporting_pop"]

    # Build the new rows: headcount is the share of people below the line in the remaining population
    results_excluding = results_excluding[["ppp_version", "country", "year", "poverty_line"]].assign(
        headcount=headcount_number_excluding / reporting_pop_excluding,
        reporting_pop=reporting_pop_excluding,
    )

    # Rename countries to World (excluding China) and World (excluding India)
    results_excluding["country"] = results_excluding["country"].replace(
        {"China": "World (excluding China)", "India": "World (excluding India)"}
    )

    # Concatenate tables
    return pd.concat([results_region, results_excluding], ignore_index=True)


def median_patch(df, country_or_region):
"""
Patch missing values in the median column.
Expand Down
4 changes: 2 additions & 2 deletions snapshots/wb/2024-03-27/world_bank_pip.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@ meta:

wdir: ../../../data/snapshots/wb/2024-01-17
outs:
- md5: 5fb032d2de430f79f25e1bdf1259c9bf
size: 35764784
- md5: 89a74ce0a636f6b0e317664b99eebd51
size: 35912832
path: world_bank_pip.csv
4 changes: 2 additions & 2 deletions snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ meta:

wdir: ../../../data/snapshots/wb/2024-01-17
outs:
- md5: f5bb53372a6fd0f563d20d04b3c897c7
size: 49972432
- md5: 87ff2bcc5473da45f0c2f2a6837bef98
size: 49910607
path: world_bank_pip_percentiles.csv
Loading