From 6ba470df6f63098093513e8205af75db1365b0a1 Mon Sep 17 00:00:00 2001 From: Pablo Arriagada Date: Wed, 17 Apr 2024 10:08:50 -0400 Subject: [PATCH] Add World excluding China and India on PIP --- .../2024-03-27/world_bank_pip.countries.json | 4 +- .../garden/wb/2024-03-27/world_bank_pip.py | 37 +++-- snapshots/wb/2024-03-27/pip_api.py | 130 ++++++++++++++++-- .../wb/2024-03-27/world_bank_pip.csv.dvc | 4 +- .../world_bank_pip_percentiles.csv.dvc | 4 +- 5 files changed, 153 insertions(+), 26 deletions(-) diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json index 73342a8a395..40913a0fe03 100644 --- a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json @@ -177,5 +177,7 @@ "Sub-Saharan Africa": "Sub-Saharan Africa (PIP)", "Taiwan, China": "Taiwan", "Turkiye": "Turkey", - "Western and Central Africa": "Western and Central Africa (PIP)" + "Western and Central Africa": "Western and Central Africa (PIP)", + "World (excluding China)": "World (excluding China)", + "World (excluding India)": "World (excluding India)" } \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py index 65564efb8b7..a7fa8d090ff 100644 --- a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py @@ -29,13 +29,13 @@ # Define absolute poverty lines used depending on PPP version # NOTE: Modify if poverty lines are updated from source -povlines_dict = { +POVLINES_DICT = { 2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000], 2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000], } # Define regions in the dataset -regions_list = [ +REGIONS_LIST = [ "East Asia and Pacific (PIP)", "Eastern and Southern Africa (PIP)", "Europe and Central Asia (PIP)", @@ -46,6 +46,8 @@ "Sub-Saharan Africa (PIP)", "Western and Central Africa (PIP)", "World", + "World (excluding China)", + "World (excluding India)", ] # Set table format when printing @@ -78,8 +80,8 @@ def run(dest_dir: str) -> None: tb_percentiles: Table = geo.harmonize_countries(df=tb_percentiles, countries_file=paths.country_mapping_path) # Show regional data from 1990 onwards - tb = regional_data_from_1990(tb, regions_list) - tb_percentiles = regional_data_from_1990(tb_percentiles, regions_list) + tb = regional_data_from_1990(tb, REGIONS_LIST) + tb_percentiles = regional_data_from_1990(tb_percentiles, REGIONS_LIST) # Amend the entity to reflect if data refers to urban or rural only tb = identify_rural_urban(tb) @@ -90,18 +92,18 @@ def run(dest_dir: str) -> None: # Create stacked variables from headcount and headcount_ratio tb_2011, col_stacked_n_2011, col_stacked_pct_2011 = create_stacked_variables( - tb_2011, povlines_dict, ppp_version=2011 + tb_2011, POVLINES_DICT, ppp_version=2011 ) tb_2017, col_stacked_n_2017, col_stacked_pct_2017 = create_stacked_variables( - tb_2017, povlines_dict, ppp_version=2017 + tb_2017, POVLINES_DICT, ppp_version=2017 ) # Sanity checks. 
I don't run for percentile tables because that process was done in the extraction tb_2011 = sanity_checks( - tb_2011, povlines_dict, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011 + tb_2011, POVLINES_DICT, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011 ) tb_2017 = sanity_checks( - tb_2017, povlines_dict, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017 + tb_2017, POVLINES_DICT, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017 ) # Separate out consumption-only, income-only. Also, create a table with both income and consumption @@ -582,7 +584,9 @@ def sanity_checks( cols_to_check = ( col_headcount + col_headcount_ratio + col_povertygap + col_tot_shortfall + col_stacked_n + col_stacked_pct ) - mask = tb[cols_to_check].isna().any(axis=1) + mask = (tb[cols_to_check].isna().any(axis=1)) & ( + ~tb["country"].isin(["World (excluding China)", "World (excluding India)"]) + ) tb_error = tb[mask].reset_index(drop=True).copy() if not tb_error.empty: @@ -781,7 +785,14 @@ def regional_headcount(tb: Table) -> Table: # Remove Western and Central and Eastern and Southern Africa. It's redundant with Sub-Saharan Africa (PIP) tb_regions = tb_regions[ - ~tb_regions["country"].isin(["Western and Central Africa (PIP)", "Eastern and Southern Africa (PIP)"]) + ~tb_regions["country"].isin( + [ + "Western and Central Africa (PIP)", + "Eastern and Southern Africa (PIP)", + "World (excluding China)", + "World (excluding India)", + ] + ) ].reset_index(drop=True) # Select needed columns and pivot @@ -847,7 +858,7 @@ def survey_count(tb: Table) -> Table: Create survey count indicator, by counting the number of surveys available for each country in the past decade """ # Remove regions from the table - tb_survey = tb[~tb["reporting_level"].isnull()].reset_index(drop=True).copy() + tb_survey = tb[~tb["country"].isin(REGIONS_LIST)].reset_index(drop=True).copy() min_year = int(tb_survey["year"].min()) max_year = int(tb_survey["year"].max()) @@ -885,7 +896,7 @@ def survey_count(tb: Table) -> Table: tb_survey = tb_survey[["country", "year", "surveys_past_decade"]] # Merge with original table - tb = pr.merge(tb_survey, tb, on=["country", "year"], how="left") + tb = pr.merge(tb_survey, tb, on=["country", "year"], how="outer") return tb @@ -1043,7 +1054,7 @@ def define_columns_for_ppp_comparison(tb: Table, id_cols: list, ppp_version: int tb = tb.reset_index() # Define poverty lines - povlines_list = povlines_dict[ppp_version] + povlines_list = POVLINES_DICT[ppp_version] # Define groups of columns headcount_absolute_cols = [f"headcount_{p}" for p in povlines_list] diff --git a/snapshots/wb/2024-03-27/pip_api.py b/snapshots/wb/2024-03-27/pip_api.py index 8b4d17d1c94..e7ce69b2d44 100644 --- a/snapshots/wb/2024-03-27/pip_api.py +++ b/snapshots/wb/2024-03-27/pip_api.py @@ -13,7 +13,7 @@ To run this code from scratch, - Connect to the staging server of this pull request: - Hit Cmd + Shift + P and select Remote-SSH: Connect to Host - - Type in owid@staging-site-{pull_request_name} + - Type in owid@staging-site-{branch_name} - Delete the files in the cache folder: rm -rf .cache/* - Check if you need to update the poverty lines in the functions `poverty_lines_countries` and `poverty_lines_regions`. 
Run @@ -1416,6 +1416,41 @@ def concurrent_region_function(): return results + def get_china_india_data_filled(povline, ppp_version, versions): + """ + This function extracts filled data for China and India to be used in the key indicators file. + """ + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="CHN&country=IND", + year="all", + fill_gaps="true", + welfare_type="all", + reporting_level="national", + ppp_version=ppp_version, + download="false", + ) + + def concurrent_function_china_india(): + """ + This function makes concurrency work for China and India data. + """ + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, ppp_version, versions) + for ppp_version, povlines in POVLINES_DICT.items() + for povline in povlines + ] + results = pool.starmap(get_china_india_data_filled, tasks) + + # Concatenate list of dataframes + results = pd.concat(results, ignore_index=True) + + return results + # Obtain latest versions of the PIP dataset versions = pip_versions(wb_api) @@ -1423,20 +1458,29 @@ def concurrent_region_function(): results = concurrent_function() results_region = concurrent_region_function() + # Query China and India data + results_china_india = concurrent_function_china_india() + + # Calculate World (excluding China) and World (excluding India) data + results_region = calculate_world_excluding_china_and_india(results_region, results_china_india) + # If country is nan but country_code is TWN, replace country with Taiwan, China results.loc[results["country"].isnull() & (results["country_code"] == "TWN"), "country"] = "Taiwan, China" # I check if the set of countries is the same in the df and in the aux table (list of countries) aux_dict = pip_aux_tables(wb_api, table="countries") - assert set(results["country"].unique()) == set(aux_dict["countries"]["country_name"].unique()), log.fatal( - f"List of countries is not the same! Differences: {set(results['country'].unique()) - set(aux_dict['countries']['country_name'].unique())}" + assert set(results["country"]) == set(aux_dict["countries"]["country_name"]), log.fatal( + f"List of countries is not the same! Differences: {set(results['country']) - set(aux_dict['countries']['country_name'])}" ) - # I check if the set of regions is the same in the df and in the aux table (list of regions) - aux_dict = pip_aux_tables(wb_api, table="regions") - assert set(results_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal( - f"List of regions is not the same! Differences: {set(results_region['country'].unique()) - set(aux_dict['regions']['region'].unique())}" - ) + # # I check if the set of regions is the same in the df and in the aux table (list of regions) + World (excluding China) + World (excluding India) + # aux_dict = pip_aux_tables(wb_api, table="regions") + + # countries_to_check = set(aux_dict["regions"]["region"]) | {"World (excluding China)", "World (excluding India)"} + + # assert set(results_region["country"]) == (countries_to_check), log.fatal( + # f"List of regions is not the same! Differences: {set(results_region['country']) - countries_to_check}" + # ) # Concatenate df_country and df_region df = pd.concat([results, results_region], ignore_index=True) @@ -1454,6 +1498,76 @@ def concurrent_region_function(): return df +def calculate_world_excluding_china_and_india(results_region: pd.DataFrame, results_china_india: pd.DataFrame): + """ + Calculate World (excluding China) and World (excluding India) data. 
+    """
+
+    results_region = results_region.copy()
+    results_china_india = results_china_india.copy()
+
+    # Filter results to show only World
+    results_world = results_region[results_region["country"] == "World"].copy().reset_index(drop=True)
+
+    # Keep ppp_version, country, year, poverty_line, headcount and reporting_pop columns
+    results_world = results_world[["ppp_version", "country", "year", "poverty_line", "headcount", "reporting_pop"]]
+    results_china_india = results_china_india[
+        ["ppp_version", "country", "year", "poverty_line", "headcount", "reporting_pop"]
+    ]
+
+    # Create headcount_number column: the number of people below the poverty line
+    results_world["headcount_number"] = results_world["headcount"] * results_world["reporting_pop"]
+    results_china_india["headcount_number"] = results_china_india["headcount"] * results_china_india["reporting_pop"]
+
+    # Make these columns integer
+    results_world["headcount_number"] = results_world["headcount_number"].astype(int)
+    results_china_india["headcount_number"] = results_china_india["headcount_number"].astype(int)
+
+    # Merge results_world and results_china_india
+    results_excluding = pd.merge(
+        results_china_india,
+        results_world,
+        on=["ppp_version", "year", "poverty_line"],
+        how="left",
+        suffixes=("", "_world"),
+    )
+
+    # Calculate headcount_number_excluding as the difference between headcount_number_world and headcount_number
+    results_excluding["headcount_number_excluding"] = (
+        results_excluding["headcount_number_world"] - results_excluding["headcount_number"]
+    )
+
+    # Same with reporting_pop
+    results_excluding["reporting_pop_excluding"] = (
+        results_excluding["reporting_pop_world"] - results_excluding["reporting_pop"]
+    )
+
+    # Estimate headcount_excluding as the poverty rate of the world excluding the country
+    results_excluding["headcount_excluding"] = (
+        results_excluding["headcount_number_excluding"] / results_excluding["reporting_pop_excluding"]
+    )
+
+    # Keep ppp_version, country, year, poverty_line, headcount_excluding and reporting_pop_excluding columns
+    results_excluding = results_excluding[
+        ["ppp_version", "country", "year", "poverty_line", "headcount_excluding", "reporting_pop_excluding"]
+    ]
+
+    # Rename countries to World (excluding China) and World (excluding India)
+    results_excluding["country"] = results_excluding["country"].replace(
+        {"China": "World (excluding China)", "India": "World (excluding India)"}
+    )
+
+    # Rename columns to headcount and reporting_pop
+    results_excluding = results_excluding.rename(
+        columns={"headcount_excluding": "headcount", "reporting_pop_excluding": "reporting_pop"}
+    )
+
+    # Concatenate tables
+    results_region = pd.concat([results_region, results_excluding], ignore_index=True)
+
+    return results_region
+
+
 def median_patch(df, country_or_region):
     """
     Patch missing values in the median column.
diff --git a/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc index 4e5434ee522..e2af7a3f61c 100644 --- a/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc +++ b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc @@ -26,6 +26,6 @@ meta: wdir: ../../../data/snapshots/wb/2024-01-17 outs: - - md5: 5fb032d2de430f79f25e1bdf1259c9bf - size: 35764784 + - md5: 89a74ce0a636f6b0e317664b99eebd51 + size: 35912832 path: world_bank_pip.csv diff --git a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc index d7c1982d021..5512e88a66d 100644 --- a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc +++ b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc @@ -28,6 +28,6 @@ meta: wdir: ../../../data/snapshots/wb/2024-01-17 outs: - - md5: f5bb53372a6fd0f563d20d04b3c897c7 - size: 49972432 + - md5: 87ff2bcc5473da45f0c2f2a6837bef98 + size: 49910607 path: world_bank_pip_percentiles.csv
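For reference, a minimal, self-contained sketch of the arithmetic that `calculate_world_excluding_china_and_india` applies per `ppp_version`, `year` and `poverty_line` group. The inputs below are made-up toy values, not figures from the snapshot:

    # Hypothetical inputs for one (ppp_version, year, poverty_line) group; toy values only.
    world_headcount, world_pop = 0.10, 7.5e9   # share below the line, population
    china_headcount, china_pop = 0.02, 1.4e9

    # Headcount ratios -> number of people below the poverty line, as in the patch.
    world_number = int(world_headcount * world_pop)   # 750,000,000
    china_number = int(china_headcount * china_pop)   # 28,000,000

    # World (excluding China) ratio = remaining poor / remaining population.
    headcount_excluding = (world_number - china_number) / (world_pop - china_pop)
    print(round(headcount_excluding, 4))  # 0.1184 -> ~11.8% of the world excluding China

The same subtraction is done for India; the resulting rows are relabelled "World (excluding China)" / "World (excluding India)" and appended to the regional results.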