Skip to content

Commit

Permalink
Add World excluding China and India on PIP
Browse files Browse the repository at this point in the history
  • Loading branch information
paarriagadap committed Apr 17, 2024
1 parent b93df70 commit 6ba470d
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -177,5 +177,7 @@
"Sub-Saharan Africa": "Sub-Saharan Africa (PIP)",
"Taiwan, China": "Taiwan",
"Turkiye": "Turkey",
"Western and Central Africa": "Western and Central Africa (PIP)"
"Western and Central Africa": "Western and Central Africa (PIP)",
"World (excluding China)": "World (excluding China)",
"World (excluding India)": "World (excluding India)"
}
37 changes: 24 additions & 13 deletions etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@

# Define absolute poverty lines used depending on PPP version
# NOTE: Modify if poverty lines are updated from source
povlines_dict = {
POVLINES_DICT = {
2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000],
2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000],
}

# Define regions in the dataset
regions_list = [
REGIONS_LIST = [
"East Asia and Pacific (PIP)",
"Eastern and Southern Africa (PIP)",
"Europe and Central Asia (PIP)",
Expand All @@ -46,6 +46,8 @@
"Sub-Saharan Africa (PIP)",
"Western and Central Africa (PIP)",
"World",
"World (excluding China)",
"World (excluding India)",
]

# Set table format when printing
Expand Down Expand Up @@ -78,8 +80,8 @@ def run(dest_dir: str) -> None:
tb_percentiles: Table = geo.harmonize_countries(df=tb_percentiles, countries_file=paths.country_mapping_path)

# Show regional data from 1990 onwards
tb = regional_data_from_1990(tb, regions_list)
tb_percentiles = regional_data_from_1990(tb_percentiles, regions_list)
tb = regional_data_from_1990(tb, REGIONS_LIST)
tb_percentiles = regional_data_from_1990(tb_percentiles, REGIONS_LIST)

# Amend the entity to reflect if data refers to urban or rural only
tb = identify_rural_urban(tb)
Expand All @@ -90,18 +92,18 @@ def run(dest_dir: str) -> None:

# Create stacked variables from headcount and headcount_ratio
tb_2011, col_stacked_n_2011, col_stacked_pct_2011 = create_stacked_variables(
tb_2011, povlines_dict, ppp_version=2011
tb_2011, POVLINES_DICT, ppp_version=2011
)
tb_2017, col_stacked_n_2017, col_stacked_pct_2017 = create_stacked_variables(
tb_2017, povlines_dict, ppp_version=2017
tb_2017, POVLINES_DICT, ppp_version=2017
)

# Sanity checks. I don't run for percentile tables because that process was done in the extraction
tb_2011 = sanity_checks(
tb_2011, povlines_dict, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011
tb_2011, POVLINES_DICT, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011
)
tb_2017 = sanity_checks(
tb_2017, povlines_dict, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017
tb_2017, POVLINES_DICT, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017
)

# Separate out consumption-only, income-only. Also, create a table with both income and consumption
Expand Down Expand Up @@ -582,7 +584,9 @@ def sanity_checks(
cols_to_check = (
col_headcount + col_headcount_ratio + col_povertygap + col_tot_shortfall + col_stacked_n + col_stacked_pct
)
mask = tb[cols_to_check].isna().any(axis=1)
mask = (tb[cols_to_check].isna().any(axis=1)) & (
~tb["country"].isin(["World (excluding China)", "World (excluding India)"])
)
tb_error = tb[mask].reset_index(drop=True).copy()

if not tb_error.empty:
Expand Down Expand Up @@ -781,7 +785,14 @@ def regional_headcount(tb: Table) -> Table:

# Remove Western and Central and Eastern and Southern Africa. It's redundant with Sub-Saharan Africa (PIP)
tb_regions = tb_regions[
~tb_regions["country"].isin(["Western and Central Africa (PIP)", "Eastern and Southern Africa (PIP)"])
~tb_regions["country"].isin(
[
"Western and Central Africa (PIP)",
"Eastern and Southern Africa (PIP)",
"World (excluding China)",
"World (excluding India)",
]
)
].reset_index(drop=True)

# Select needed columns and pivot
Expand Down Expand Up @@ -847,7 +858,7 @@ def survey_count(tb: Table) -> Table:
Create survey count indicator, by counting the number of surveys available for each country in the past decade
"""
# Remove regions from the table
tb_survey = tb[~tb["reporting_level"].isnull()].reset_index(drop=True).copy()
tb_survey = tb[~tb["country"].isin(REGIONS_LIST)].reset_index(drop=True).copy()

min_year = int(tb_survey["year"].min())
max_year = int(tb_survey["year"].max())
Expand Down Expand Up @@ -885,7 +896,7 @@ def survey_count(tb: Table) -> Table:
tb_survey = tb_survey[["country", "year", "surveys_past_decade"]]

# Merge with original table
tb = pr.merge(tb_survey, tb, on=["country", "year"], how="left")
tb = pr.merge(tb_survey, tb, on=["country", "year"], how="outer")

return tb

Expand Down Expand Up @@ -1043,7 +1054,7 @@ def define_columns_for_ppp_comparison(tb: Table, id_cols: list, ppp_version: int

tb = tb.reset_index()
# Define poverty lines
povlines_list = povlines_dict[ppp_version]
povlines_list = POVLINES_DICT[ppp_version]

# Define groups of columns
headcount_absolute_cols = [f"headcount_{p}" for p in povlines_list]
Expand Down
130 changes: 122 additions & 8 deletions snapshots/wb/2024-03-27/pip_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
To run this code from scratch,
- Connect to the staging server of this pull request:
- Hit Cmd + Shift + P and select Remote-SSH: Connect to Host
- Type in owid@staging-site-{pull_request_name}
- Type in owid@staging-site-{branch_name}
- Delete the files in the cache folder:
rm -rf .cache/*
- Check if you need to update the poverty lines in the functions `poverty_lines_countries` and `poverty_lines_regions`. Run
Expand Down Expand Up @@ -1416,27 +1416,71 @@ def concurrent_region_function():

return results

def get_china_india_data_filled(povline, ppp_version, versions):
    """
    Query the gap-filled, national-level series for China and India at one poverty line.

    `povline` is divided by 100 before being sent as the query value, while `ppp_version`
    and `versions` are forwarded unchanged to `pip_query_country`.
    The output is meant to be used in the key indicators file.
    """
    query_options = {
        "popshare_or_povline": "povline",
        "value": povline / 100,
        "versions": versions,
        # NOTE(review): both countries seem to be requested in one call by embedding a second
        # `country=` parameter in the code - confirm against how pip_query_country builds the URL.
        "country_code": "CHN&country=IND",
        "year": "all",
        "fill_gaps": "true",
        "welfare_type": "all",
        "reporting_level": "national",
        "ppp_version": ppp_version,
        "download": "false",
    }

    return pip_query_country(wb_api, **query_options)

def concurrent_function_china_india():
    """
    Run the China and India queries concurrently, one per (PPP version, poverty line) pair.

    Returns a single dataframe with the results of all the queries concatenated.
    """
    # One task per poverty line of each PPP version, following the order of POVLINES_DICT
    tasks = []
    for ppp_version, povlines in POVLINES_DICT.items():
        for povline in povlines:
            tasks.append((povline, ppp_version, versions))

    with ThreadPool(MAX_WORKERS) as pool:
        frames = pool.starmap(get_china_india_data_filled, tasks)

    # Stack the list of dataframes into one
    return pd.concat(frames, ignore_index=True)

# Obtain latest versions of the PIP dataset
versions = pip_versions(wb_api)

# Run the main function
results = concurrent_function()
results_region = concurrent_region_function()

# Query China and India data
results_china_india = concurrent_function_china_india()

# Calculate World (excluding China) and World (excluding India) data
results_region = calculate_world_excluding_china_and_india(results_region, results_china_india)

# If country is nan but country_code is TWN, replace country with Taiwan, China
results.loc[results["country"].isnull() & (results["country_code"] == "TWN"), "country"] = "Taiwan, China"

# I check if the set of countries is the same in the df and in the aux table (list of countries)
aux_dict = pip_aux_tables(wb_api, table="countries")
assert set(results["country"].unique()) == set(aux_dict["countries"]["country_name"].unique()), log.fatal(
f"List of countries is not the same! Differences: {set(results['country'].unique()) - set(aux_dict['countries']['country_name'].unique())}"
assert set(results["country"]) == set(aux_dict["countries"]["country_name"]), log.fatal(
f"List of countries is not the same! Differences: {set(results['country']) - set(aux_dict['countries']['country_name'])}"
)

# I check if the set of regions is the same in the df and in the aux table (list of regions)
aux_dict = pip_aux_tables(wb_api, table="regions")
assert set(results_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal(
f"List of regions is not the same! Differences: {set(results_region['country'].unique()) - set(aux_dict['regions']['region'].unique())}"
)
# # I check if the set of regions is the same in the df and in the aux table (list of regions) + World (excluding China) + World (excluding India)
# aux_dict = pip_aux_tables(wb_api, table="regions")

# countries_to_check = set(aux_dict["regions"]["region"]) | {"World (excluding China)", "World (excluding India)"}

# assert set(results_region["country"]) == (countries_to_check), log.fatal(
# f"List of regions is not the same! Differences: {set(results_region['country']) - countries_to_check}"
# )

# Concatenate df_country and df_region
df = pd.concat([results, results_region], ignore_index=True)
Expand All @@ -1454,6 +1498,76 @@ def concurrent_region_function():
return df


def calculate_world_excluding_china_and_india(
    results_region: pd.DataFrame, results_china_india: pd.DataFrame
) -> pd.DataFrame:
    """
    Calculate World (excluding China) and World (excluding India) data.

    For each (ppp_version, year, poverty_line) combination available for China and India, the
    number of people below the line and the population of that country are subtracted from the
    World figures, and the headcount share is recomputed from the two differences.

    Parameters
    ----------
    results_region:
        Regional results. Must contain a "World" entry in `country`, plus the columns
        `ppp_version`, `year`, `poverty_line`, `headcount` and `reporting_pop`.
    results_china_india:
        Results for "China" and "India" with the same columns.

    Returns
    -------
    A new dataframe with the rows of `results_region` followed by the "World (excluding China)"
    and "World (excluding India)" rows. The new rows only carry `ppp_version`, `country`, `year`,
    `poverty_line`, `headcount` and `reporting_pop`; any other column of `results_region` is
    missing for them. Neither input is modified.
    """
    # Local import: keeps the function independent of a module-level numpy import
    import numpy as np

    merge_keys = ["ppp_version", "year", "poverty_line"]
    needed_cols = ["ppp_version", "country", "year", "poverty_line", "headcount", "reporting_pop"]

    # Work on independent copies restricted to the needed columns, so that adding columns below
    # neither touches the inputs nor triggers SettingWithCopyWarning
    is_world = results_region["country"] == "World"
    results_world = results_region.loc[is_world, needed_cols].reset_index(drop=True).copy()
    results_china_india = results_china_india[needed_cols].copy()

    # Create headcount_number (number of people below the line) from the headcount share.
    # The value is truncated to a whole number of people. np.trunc is used instead of
    # `.astype(int)` because the integer cast raises when any value is missing, while
    # truncating in float keeps NaN and gives the same numbers otherwise.
    for frame in (results_world, results_china_india):
        frame["headcount_number"] = np.trunc(frame["headcount"] * frame["reporting_pop"])

    # Attach the World values to each China/India row.
    # With a left merge, rows without a matching World row end up with missing results.
    results_excluding = pd.merge(
        results_china_india,
        results_world,
        on=merge_keys,
        how="left",
        suffixes=("", "_world"),
    )

    # World minus country, both for the number of people below the line and for population
    results_excluding["headcount_number_excluding"] = (
        results_excluding["headcount_number_world"] - results_excluding["headcount_number"]
    )
    results_excluding["reporting_pop_excluding"] = (
        results_excluding["reporting_pop_world"] - results_excluding["reporting_pop"]
    )

    # Headcount share for the world without the country
    results_excluding["headcount_excluding"] = (
        results_excluding["headcount_number_excluding"] / results_excluding["reporting_pop_excluding"]
    )

    # Keep only the identifiers and the two recalculated indicators
    results_excluding = results_excluding[
        ["ppp_version", "country", "year", "poverty_line", "headcount_excluding", "reporting_pop_excluding"]
    ].copy()

    # Rename countries to World (excluding China) and World (excluding India)
    results_excluding["country"] = results_excluding["country"].replace(
        {"China": "World (excluding China)", "India": "World (excluding India)"}
    )

    # Rename the recalculated indicators to the standard column names
    results_excluding = results_excluding.rename(
        columns={"headcount_excluding": "headcount", "reporting_pop_excluding": "reporting_pop"}
    )

    # Append the new aggregates to the regional table
    return pd.concat([results_region, results_excluding], ignore_index=True)


def median_patch(df, country_or_region):
"""
Patch missing values in the median column.
Expand Down
4 changes: 2 additions & 2 deletions snapshots/wb/2024-03-27/world_bank_pip.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@ meta:

wdir: ../../../data/snapshots/wb/2024-01-17
outs:
- md5: 5fb032d2de430f79f25e1bdf1259c9bf
size: 35764784
- md5: 89a74ce0a636f6b0e317664b99eebd51
size: 35912832
path: world_bank_pip.csv
4 changes: 2 additions & 2 deletions snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,6 @@ meta:

wdir: ../../../data/snapshots/wb/2024-01-17
outs:
- md5: f5bb53372a6fd0f563d20d04b3c897c7
size: 49972432
- md5: 87ff2bcc5473da45f0c2f2a6837bef98
size: 49910607
path: world_bank_pip_percentiles.csv

0 comments on commit 6ba470d

Please sign in to comment.