number of countries indicators

owid · May 14, 2024 · d2cddab · d2cddab
1 parent cd44b7a
commit d2cddab
Show file tree

Hide file tree

Showing 6 changed files with 231 additions and 12 deletions.
diff --git a/dag/democracy.yml b/dag/democracy.yml
@@ -44,5 +44,7 @@ steps:
     - snapshot://democracy/2024-05-13/polity.xlsx
   data://garden/democracy/2024-03-07/polity:
     - data://meadow/democracy/2024-05-13/polity
+    - data://garden/regions/2023-01-01/regions
+    - data://garden/demography/2023-03-31/population
   data://grapher/democracy/2024-05-13/polity:
     - data://garden/democracy/2024-03-07/polity
diff --git a/etl/steps/data/garden/democracy/2024-03-07/polity.countries_impute.yml b/etl/steps/data/garden/democracy/2024-03-07/polity.countries_impute.yml
@@ -93,8 +93,8 @@
 # Serbia and Montenegro
 - country:
     - Serbia
-    - Montenegro
     - Kosovo
+    - Montenegro
   country_impute: Serbia and Montenegro
   year_min: 1991
   year_max: 2005

diff --git a/etl/steps/data/garden/democracy/2024-03-07/polity.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/polity.meta.yml
@@ -27,6 +27,8 @@ tables:
           It combines information on the extent to which open, multi-party, and competitive elections choose a chief executive who faces comprehensive institutional constraints, and political participation is competitive.
 
           It ranges from -10 to 10 (fully democratic).
+        description_processing: |-
+          Values for continents have been obtained by averaging the values of the countries in the continent.
         description_from_producer: |-
           Indicator name: `polity2`
 
@@ -200,7 +202,9 @@ tables:
         description_short: |-
           It combines information on the extent to which open, multi-party, and competitive elections choose a chief executive who faces comprehensive institutional constraints, and political participation is competitive. It ranges from 0 to 20 (fully democratic).
         description_processing:
-          It matches the variable polity2 in Polity 5 (2021), with the values rescaled to range from 0 to 20 instead of -10 to 10.
+          It matches the variable `polity2` in Polity 5 (2021), with the values rescaled to range from 0 to 20 instead of -10 to 10.
+
+          Values for continents have been obtained by averaging the values of the countries in the continent.
 
       age_dem_polity:
         title: Democracy age
@@ -228,3 +232,24 @@ tables:
           - Democracies are understood here as having broadly open, multi-party, and competitive elections choosing a chief executive who faces comprehensive institutional constraints, and competitive political participation.
           - The variable distinguishes between autocracies (score of 0), anocracies (score of 1), democracies aged 1-18 years (score of 2), 19-30 years (score of 3), 31-60 years (score of 4), 61-90 years (score of 5), and 91+ years (score of 6).
 
+
+  num_countries:
+    variables:
+      num_regime_polity:
+        title: |-
+          <% if category == '-1' %>
+          Number of countries with unknown regime
+          <% else %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
+          <% endif %>
+        unit: "countries"
+      num_group_age_dem_polity:
+        title: |-
+          <% if 'years' in category %>
+          Number of electoral democracies aged << category >>
+          <% elif category == '-1' %>
+          Number of countries with unknown regime (age groups)
+          <% else %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups)
+          <% endif %>
+        unit: "countries"
diff --git a/etl/steps/data/garden/democracy/2024-03-07/polity.py b/etl/steps/data/garden/democracy/2024-03-07/polity.py
@@ -40,9 +40,20 @@
     5 "competitive"
 """
 
+from typing import Tuple, cast
+
+import numpy as np
 import pandas as pd
-from owid.catalog import Table
-from shared import add_age_groups, add_count_years_in_regime, add_imputes
+from owid.catalog import Dataset, Table
+from owid.catalog.tables import concat
+from shared import (
+    add_age_groups,
+    add_count_years_in_regime,
+    add_imputes,
+    add_regions_and_global_aggregates,
+    from_wide_to_long,
+    make_table_with_dummies,
+)
 
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
@@ -58,13 +69,36 @@
     1: "anocracy",
 }
 
+# Regions used for aggregates; `additional_members` lists historical states whose region classification is missing
+REGIONS = {
+    "Africa": {},
+    "Asia": {},
+    "North America": {
+        "additional_members": [
+            "United Provinces of Central America",
+        ]
+    },
+    "South America": {},
+    "Europe": {
+        "additional_members": [
+            "Prussia",
+        ]
+    },
+    "Oceania": {},
+}
+
+# Year range for aggregates
+YEAR_AGG_MIN = 1800
+YEAR_AGG_MAX = 2018
+
 
 def run(dest_dir: str) -> None:
     #
     # Load inputs.
     #
     # Load meadow dataset.
     ds_meadow = paths.load_dataset("polity")
+    ds_regions = paths.load_dataset(short_name="regions")
 
     # Read table from meadow dataset.
     tb = ds_meadow["polity"].reset_index()
@@ -103,24 +137,44 @@ def run(dest_dir: str) -> None:
     col_flag_imputed = "values_imputed"
     tb = add_imputes(tb=tb, path=PATH_IMPUTE, col_flag_imputed=col_flag_imputed)
 
+    # Remove countries to avoid overlaps
+    # tb = tb.loc[~((tb["country"] == "USSR") & (tb["year"] == 1991))]
+
+    ##################################################
+    # AGGREGATES
+
+    # Get country-count-related data: country-averages, number of countries, ...
+    tb_num_countries, tb_avg_countries = get_country_data(tb, ds_regions)
+
+    # Get population-related data: population-weighted averages, people living in ...
+    tb_population = get_population_data(tb)
+
     # Get region data
     # tb_regions = tb.loc[~tb[col_flag_imputed]].drop(columns=[col_flag_imputed]).copy()
 
-    # Drop is imputed flag
-    tb = tb.drop(columns=[col_flag_imputed])
+    ##################################################
+
+    # Add regions to main table
+    tb = concat([tb, tb_avg_countries], ignore_index=True)
 
     # Remove columns
-    tb = tb.drop(columns=["ccode"])
+    tb = tb.drop(columns=[col_flag_imputed, "ccode"])
 
     # Format table
     tb = tb.format(["country", "year"])
+    tb_num_countries = tb_num_countries.format(["country", "year", "category"], short_name="num_countries")
 
     #
     # Save outputs.
     #
+    tables = [
+        tb,
+        tb_num_countries,
+    ]
+
     # Create a new garden dataset with the same metadata as the meadow dataset.
     ds_garden = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata
     )
 
     # Save changes in the new garden dataset.
@@ -139,6 +193,16 @@ def harmonize_country_names(tb: Table) -> Table:
     ## Fix Pakistan entity (ccode = 769 is actually 'Pakistan (former)')
     tb["country"] = tb["country"].astype("string")
     tb.loc[tb["ccode"] == 769, "country"] = "Pakistan (former)"
+
+    ## Fix Sudan: remove former country for 2011 (already have north/south sudan data that year)
+    tb = tb.loc[~((tb["ccode"] == 625) & (tb["year"] == 2011))]
+
+    ## Fix Serbia and Montenegro: remove data for Serbia@2006 (already have S&M data that year)
+    # tb = tb.loc[~((tb["ccode"] == 342) & (tb["year"] == 2006))]
+
+    ## Fix Ethiopia (former): remove its 1993 data (Ethiopia and Eritrea already have data that year)
+    tb = tb.loc[~((tb["ccode"] == 530) & (tb["year"] == 1993))]
+
     ## Classic harmonization
     tb = geo.harmonize_countries(
         df=tb,
@@ -183,3 +247,118 @@ def add_age_and_experience(tb: Table) -> Table:
         tb[col_age] = tb[col_age].astype("string")
 
     return tb
+
+
+def get_country_data(tb: Table, ds_regions: Dataset) -> Tuple[Table, Table]:
+    """Estimate number of countries in each regime, and country-average for some indicators.
+
+    Returns two tables:
+
+    1) tb_num_countries: Counts countries in different categories
+        regime_polity (counts)
+            - Number of autocracies
+            - Number of anocracies
+            - Number of democracies
+
+        group_age_dem_polity (counts)
+            - Number of democracies aged 1-18 years
+            - Number of democracies aged 19-30 years
+            - Number of democracies aged 31-60 years
+            - Number of democracies aged 61-90 years
+            - Number of democracies aged 91+ years
+
+    2) tb_avg_countries: Country-average for some indicators
+        - democracy_polity (country-average)
+        - democracy_recod_polity (country-average)
+    """
+    tb_ = tb.loc[~tb["values_imputed"]].copy()
+
+    # 1/ COUNT COUNTRIES
+    # Work on a copy of the non-imputed data (imputed rows were already filtered out above)
+    tb_num = tb_.copy()
+
+    # Set INTs
+    tb_num = tb_num.astype(
+        {
+            "regime_polity": "Int64",
+        }
+    )
+    tb_num = cast(Table, tb_num)
+
+    # Define columns on which we will estimate (i) "number of countries" and (ii) "number of people living in ..."
+    indicators = [
+        {
+            "name": "regime_polity",
+            "name_new": "num_regime_polity",
+            "values_expected": {"0": "autocracy", "1": "anocracy", "2": "democracy"},
+            "has_na": True,
+        },
+        {
+            "name": "group_age_dem_polity",
+            "name_new": "num_group_age_dem_polity",
+            "values_expected": {
+                "autocracy": "autocracy",
+                "anocracy": "anocracy",
+                "1-18 years": "1-18 years",
+                "19-30 years": "19-30 years",
+                "31-60 years": "31-60 years",
+                "61-90 years": "61-90 years",
+                "91+ years": "91+ years",
+            },
+            "has_na": True,
+        },
+    ]
+
+    # Column per indicator-dimension
+    tb_num = make_table_with_dummies(tb_num, indicators)
+
+    # Add regions and global aggregates
+    tb_num = add_regions_and_global_aggregates(tb_num, ds_regions, regions=REGIONS)
+    tb_num = from_wide_to_long(tb_num)
+
+    # Keep only certain year range
+    tb_num = tb_num.loc[tb_num["year"].between(YEAR_AGG_MIN, YEAR_AGG_MAX)]
+
+    # 2/ COUNTRY-AVERAGE INDICATORS
+    tb_avg = tb_.copy()
+    indicators_avg = ["democracy_polity", "democracy_recod_polity"]
+
+    # Keep only relevant columns
+    tb_avg = tb_avg.loc[:, ["year", "country"] + indicators_avg]
+
+    # Estimate region aggregates
+    tb_avg = add_regions_and_global_aggregates(
+        tb=tb_avg,
+        ds_regions=ds_regions,
+        aggregations={k: "mean" for k in indicators_avg},  # type: ignore
+        aggregations_world={k: np.mean for k in indicators_avg},  # type: ignore
+        regions=REGIONS,
+    )
+
+    # Keep only certain year range
+    tb_avg = tb_avg.loc[tb_avg["year"].between(YEAR_AGG_MIN, YEAR_AGG_MAX)]
+
+    return tb_num, tb_avg
+
+
+def get_population_data(tb: Table) -> Table:
+    """Estimate people living in each regime, and population-weighted averages for some indicators.
+
+    regime_polity (people living)
+        - People living in autocracies
+        - People living in anocracies
+        - People living in democracies
+
+    group_age_dem_polity (people living)
+        - People living in democracies aged 1-18 years
+        - People living in democracies aged 19-30 years
+        - People living in democracies aged 31-60 years
+        - People living in democracies aged 61-90 years
+        - People living in democracies aged 91+ years
+
+    democracy_polity (population-weighted average)
+
+    """
+    tb_ = tb.copy()
+
+    return tb_
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -403,7 +403,11 @@ def add_age_groups(
 
 
 def add_imputes(
-    tb: Table, path: Path, cols_verify: List[str] | None = None, col_flag_imputed: str | None = None
+    tb: Table,
+    path: Path,
+    cols_verify: List[str] | None = None,
+    col_flag_imputed: str | None = None,
+    verify_integrity: bool = True,
 ) -> Table:
     """Add imputed values to the table.
 
@@ -443,7 +447,7 @@ def add_imputes(
         assert tb_imp_["year"].max() == impute["year_max"], f"Missing years (max check) for {impute['country_impute']}"
         assert (a := tb_imp_["year"].min()) == (
             b := impute["year_min"]
-        ), f"Missing years (min check) for {impute['country_impute']}: {a} != {b}"
+        ), f"Missing years (min check) for {impute['country']} imputed from {impute['country_impute']}: {a} != {b}"
 
         # Tweak them
         # tb_ = tb_.rename(
@@ -479,5 +483,6 @@ def add_imputes(
     # tb_ = cast(Table, tb_[cols])
 
     # Verify that there are no duplicates
-    tb_ = tb_.set_index(cols_verify, verify_integrity=True).sort_index().reset_index()
+    if verify_integrity:
+        tb_ = tb_.set_index(cols_verify, verify_integrity=True).sort_index().reset_index()
     return tb_
diff --git a/etl/steps/data/grapher/democracy/2024-05-13/polity.py b/etl/steps/data/grapher/democracy/2024-05-13/polity.py
@@ -15,17 +15,25 @@ def run(dest_dir: str) -> None:
 
     # Read table from garden dataset.
     tb = ds_garden["polity"]
+    tb_num_countries = ds_garden["num_countries"]
 
     #
     # Process data.
     #
+    tables = [
+        tb,
+        tb_num_countries,
+    ]
 
     #
     # Save outputs.
     #
     # Create a new grapher dataset with the same metadata as the garden dataset.
     ds_grapher = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+        dest_dir,
+        tables=tables,
+        check_variables_metadata=True,
+        default_metadata=ds_garden.metadata,
     )
 
     # Save changes in the new grapher dataset.