owid · lucasrodes · Nov 8, 2023 · Oct 27, 2023 · Oct 27, 2023 · Oct 30, 2023
diff --git a/etl/steps/data/garden/countries/2023-09-25/gleditsch.py b/etl/steps/data/garden/countries/2023-09-25/gleditsch.py
@@ -3,8 +3,10 @@
 import owid.catalog.processing as pr
 from owid.catalog import Dataset, Table
 from shared import (
+    LAST_YEAR,
     add_latest_years_with_constant_num_countries,
     add_population_to_table,
+    fill_timeseries,
     init_table_countries_in_region,
 )
 
@@ -33,12 +35,13 @@ def run(dest_dir: str) -> None:
     #
     tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
 
-    # Minor fix
-    tb.loc[tb["country"] == "German Federal Republic", "end"] = "02:10:1990"
-
     # Format table
     tb_formatted = format_table(tb)
 
+    # Minor fix
+    ## GW code 260 should be referred to as 'West Germany' before 1990, and as 'Germany' from 1990 onwards
+    tb_formatted.loc[(tb_formatted["id"] == 260) & (tb_formatted["year"] >= 1990), "country"] = "Germany"
+
     # Create new table
     tb_regions = create_table_countries_in_region(tb_formatted, ds_pop)
 
@@ -48,9 +51,13 @@ def run(dest_dir: str) -> None:
     # Combine tables
     tb_regions = tb_regions.merge(tb_pop, how="left", on=["region", "year"])
 
+    # Get table with id, year, country (whenever that country was present)
+    tb_countries = create_table_country_years(tb_formatted)
+
     # Add to table list
     tables = [
         tb.set_index(["id", "start", "end"], verify_integrity=True).sort_index(),
+        tb_countries.set_index(["id", "year"], verify_integrity=True).sort_index(),
         tb_regions.set_index(["region", "year"], verify_integrity=True).sort_index(),
     ]
 
@@ -87,6 +94,23 @@ def format_table(tb: Table) -> Table:
     return tb
 
 
+def create_table_country_years(tb: Table) -> Table:
+    """Return (id, year, country) rows, repeating the EXPECTED_LAST_YEAR countries up to LAST_YEAR."""
+    country_years = tb[["id", "year", "country"]].copy()
+
+    # Countries listed in EXPECTED_LAST_YEAR, without their year so that new years can be attached
+    is_last_year = country_years["year"] == EXPECTED_LAST_YEAR
+    last_year_countries = country_years[is_last_year].drop(columns="year")
+
+    # One copy of those countries for each year from EXPECTED_LAST_YEAR + 1 to LAST_YEAR
+    extension = fill_timeseries(last_year_countries, year_min=EXPECTED_LAST_YEAR + 1, year_max=LAST_YEAR)
+
+    combined = pr.concat(
+        [country_years, extension], ignore_index=True, short_name="gleditsch_countries"
+    )
+    return combined
+
+
 def create_table_countries_in_region(tb: Table, ds_pop: Dataset) -> Table:
     """Create table with number of countries in each region per year."""
     # Get number of countries per region per year

diff --git a/etl/steps/data/garden/countries/2023-09-25/isd.py b/etl/steps/data/garden/countries/2023-09-25/isd.py
@@ -3,8 +3,10 @@
 import owid.catalog.processing as pr
 from owid.catalog import Table
 from shared import (
+    LAST_YEAR,
     add_latest_years_with_constant_num_countries,
     add_population_to_table,
+    fill_timeseries,
     init_table_countries_in_region,
 )
 from structlog import get_logger
@@ -60,10 +62,14 @@ def run(dest_dir: str) -> None:
     # Combine tables
     tb_regions = tb_regions.merge(tb_pop, how="left", on=["region", "year"])
 
+    # Get table with id, year, country (whenever that country was present)
+    tb_countries = create_table_country_years(tb_formatted)
+
     # Add to tables list
     tables = [
         tb.set_index(["cownum", "start", "end"], verify_integrity=True).sort_index(),
         tb_regions.set_index(["region", "year"], verify_integrity=True).sort_index(),
+        tb_countries.set_index(["id", "year"], verify_integrity=True).sort_index(),
     ]
 
     # tb = tb.set_index(["country", "year"], verify_integrity=True)
@@ -237,3 +243,35 @@ def code_to_region_alt(cow_code: int) -> str:
             return "North Africa and the Middle East"
         case _:
             return "Rest"
+
+
+def create_table_country_years(tb: Table) -> Table:
+    """Return (id, year, country) rows, repeating the EXPECTED_LAST_YEAR countries up to LAST_YEAR."""
+    # Keep the three identifying columns and rename them to id / country
+    country_years = tb[["cownum", "year", "statename"]].copy()
+    country_years = country_years.rename(columns={"cownum": "id", "statename": "country"})
+
+    # Countries listed in EXPECTED_LAST_YEAR, without their year so that new years can be attached
+    is_last_year = country_years["year"] == EXPECTED_LAST_YEAR
+    last_year_countries = country_years[is_last_year].drop(columns="year")
+
+    # One copy of those countries for each year from EXPECTED_LAST_YEAR + 1 to LAST_YEAR
+    extension = fill_timeseries(last_year_countries, year_min=EXPECTED_LAST_YEAR + 1, year_max=LAST_YEAR)
+
+    # Stack the original rows and the extension
+    combined = pr.concat(
+        [country_years, extension],
+        ignore_index=True,
+        short_name="isd_countries",
+    )
+
+    # Fix country names: code 345 is relabelled depending on the year
+    combined["country"] = combined["country"].astype(str)
+    is_code_345 = combined["id"] == 345
+    year = combined["year"]
+    ## From 1992 up to (not including) 2006: Serbia and Montenegro
+    combined.loc[is_code_345 & (year >= 1992) & (year < 2006), "country"] = "Serbia and Montenegro"
+    ## From 2006 onwards: Serbia
+    combined.loc[is_code_345 & (year >= 2006), "country"] = "Serbia"
+
+    return combined
diff --git a/etl/steps/data/garden/countries/2023-09-25/shared.py b/etl/steps/data/garden/countries/2023-09-25/shared.py
@@ -1,4 +1,5 @@
 from datetime import datetime as dt
+from typing import Optional, cast
 
 import owid.catalog.processing as pr
 import pandas as pd
@@ -45,13 +46,17 @@ def init_table_countries_in_region(
     return tb_regions
 
 
-def add_latest_years_with_constant_num_countries(tb_regions: Table, column_year: str, expected_last_year: int) -> Table:
+def add_latest_years_with_constant_num_countries(
+    tb: Table,
+    column_year: str,
+    expected_last_year: int,
+) -> Table:
     """Extend data until LAST_YEAR with constant number of countries.
 
     Data stops at expected_last_year, extend it until LAST_YEAR with constant number of countries.
     """
     # Check latest year is as expected, drop year column
-    tb_last = tb_regions.sort_values(column_year).drop_duplicates(subset=["region"], keep="last")
+    tb_last = tb.sort_values(column_year).drop_duplicates(subset=["region"], keep="last")
     assert (tb_last.year.unique() == expected_last_year).all(), f"Last year is not {expected_last_year}!"
     tb_last = tb_last.drop(columns=[column_year])
 
@@ -60,9 +65,9 @@ def add_latest_years_with_constant_num_countries(tb_regions: Table, column_year:
     tb_last = tb_last[["region", "number_countries"]].merge(tb_all_years, how="cross")
 
     # Add to main table
-    tb_regions = pr.concat([tb_regions, tb_last], ignore_index=True).sort_values(["region", column_year])
+    tb = pr.concat([tb, tb_last], ignore_index=True).sort_values(["region", column_year])
 
-    return tb_regions
+    return tb
 
 
 def expand_observations(tb: Table, col_year_start: str, col_year_end: str) -> Table:
@@ -75,14 +80,61 @@ def expand_observations(tb: Table, col_year_start: str, col_year_end: str) -> Ta
     # Add missing years for each triplet ("warcode", "campcode", "ccode")
     YEAR_MIN = tb[col_year_start].min()
     YEAR_MAX = tb[col_year_end].max()
-    tb_all_years = Table(pd.RangeIndex(YEAR_MIN, YEAR_MAX + 1), columns=["year"])
-    tb = tb.merge(tb_all_years, how="cross")
+    if "year" in tb.columns:
+        raise ValueError("Column 'year' already in table!")
+    else:
+        tb = fill_timeseries(tb, YEAR_MIN, YEAR_MAX)
     # Filter only entries that actually existed
     tb = tb[(tb["year"] >= tb[col_year_start]) & (tb["year"] < tb[col_year_end])]
 
     return tb
 
 
+def fill_timeseries(
+    tb: Table,
+    year_min: Optional[int],
+    year_max: Optional[int],
+    default_min: bool = False,
+    default_max: bool = False,
+    col_year_start: Optional[str] = None,
+    col_year_end: Optional[str] = None,
+    filter_times: bool = False,
+) -> Table:
+    """Cross-join `tb` with a new 'year' column covering year_min..year_max (both included).
+
+    With `default_min` (`default_max`), the bound is the min of `col_year_start` (max of `col_year_end`) instead.
+    With `filter_times`, only rows with col_year_start <= year < col_year_end are kept. Raises ValueError if a
+    bound or a required column is missing, or if `tb` already has a 'year' column.
+    """
+    # Get starting year
+    if default_min:
+        if not col_year_start or col_year_start not in tb.columns:
+            raise ValueError(f"{col_year_start} not in table columns!")
+        year_min = tb[col_year_start].min()
+    elif year_min is None:
+        raise ValueError("Either `year_min` must be a value or `default_min` must be True")
+    # Get ending year
+    if default_max:
+        if not col_year_end or col_year_end not in tb.columns:
+            raise ValueError(f"{col_year_end} not in table columns!")
+        year_max = tb[col_year_end].max()
+    elif year_max is None:
+        raise ValueError("Either `year_max` must be a value or `default_max` must be True")
+
+    # Cross merge with missing years
+    tb_all_years = Table(pd.RangeIndex(year_min, cast(int, year_max) + 1), columns=["year"])
+    if "year" in tb.columns:
+        raise ValueError("Column 'year' already in table! Please drop it from `tb`.")
+    tb = tb.merge(tb_all_years, how="cross")
+
+    # Only keep years that 'make sense'; both column names are required (an unset name would index tb[None])
+    if filter_times:
+        if not (col_year_start and col_year_end and {col_year_start, col_year_end} <= set(tb.columns)):
+            raise ValueError(f"Columns {col_year_start} and {col_year_end} must be in table columns!")
+        tb = tb[(tb["year"] >= tb[col_year_start]) & (tb["year"] < tb[col_year_end])]
+    return tb
+
+
 def _get_start_year(date_str: str, date_format: str) -> int:
     date = dt.strptime(date_str, date_format)
     return date.year

diff --git a/etl/steps/data/garden/countries/2023-09-29/cow_ssm.py b/etl/steps/data/garden/countries/2023-09-29/cow_ssm.py
@@ -1,5 +1,7 @@
 """Load a meadow dataset and create a garden dataset."""
 
+from typing import Optional, cast
+
 import owid.catalog.processing as pr
 import pandas as pd
 from owid.catalog import Dataset, Table
@@ -66,12 +68,16 @@ def run(dest_dir: str) -> None:
     # Combine tables
     tb_regions = tb_regions.merge(tb_pop, how="left", on=["region", "year"])
 
+    # Get table with id, year, country (whenever that country was present)
+    tb_countries = create_table_country_years(tb)
+
     # Group tables and format tables
     tables = [
         tb_system.set_index(["ccode", "year"], verify_integrity=True).sort_index(),
         tb_states.set_index(["ccode", "styear", "stmonth", "stday", "endyear", "endmonth", "endday"]).sort_index(),
         tb_majors.set_index(["ccode", "styear", "stmonth", "stday", "endyear", "endmonth", "endday"]).sort_index(),
         tb_regions.set_index(["region", "year"], verify_integrity=True).sort_index(),
+        tb_countries.set_index(["id", "year"], verify_integrity=True).sort_index(),
     ]
 
     #
@@ -170,3 +176,69 @@ def add_population_to_table(tb: Table, ds_pop: Dataset, country_col: str = "coun
     tb_pop = pr.concat([tb_pop_regions, tb_pop_world], ignore_index=True)
 
     return tb_pop
+
+
+def create_table_country_years(tb: Table) -> Table:
+    """Return (id, year, country) rows, repeating the EXPECTED_LAST_YEAR countries up to LAST_YEAR."""
+    country_years = (
+        tb[["ccode", "year", "statenme"]]
+        .copy()
+        .rename(columns={"ccode": "id", "statenme": "country"})
+    )
+
+    # Countries listed in EXPECTED_LAST_YEAR, without their year so that new years can be attached
+    is_last_year = country_years["year"] == EXPECTED_LAST_YEAR
+    last_year_countries = country_years[is_last_year].drop(columns="year")
+
+    # One copy of those countries for each year from EXPECTED_LAST_YEAR + 1 to LAST_YEAR
+    extension = fill_timeseries(last_year_countries, year_min=EXPECTED_LAST_YEAR + 1, year_max=LAST_YEAR)
+
+    combined = pr.concat([country_years, extension], ignore_index=True, short_name="cow_ssm_countries")
+    # Cast year to int (presumably the concatenation can change its dtype; TODO confirm)
+    combined["year"] = combined["year"].astype(int)
+    return combined
+
+
+def fill_timeseries(
+    tb: Table,
+    year_min: Optional[int],
+    year_max: Optional[int],
+    default_min: bool = False,
+    default_max: bool = False,
+    col_year_start: Optional[str] = None,
+    col_year_end: Optional[str] = None,
+    filter_times: bool = False,
+) -> Table:
+    """Cross-join `tb` with a new 'year' column covering year_min..year_max (both included).
+
+    With `default_min` (`default_max`), the bound is the min of `col_year_start` (max of `col_year_end`) instead.
+    With `filter_times`, only rows with col_year_start <= year < col_year_end are kept. Raises ValueError if a
+    bound or a required column is missing, or if `tb` already has a 'year' column.
+    """
+    # Get starting year
+    if default_min:
+        if not col_year_start or col_year_start not in tb.columns:
+            raise ValueError(f"{col_year_start} not in table columns!")
+        year_min = tb[col_year_start].min()
+    elif year_min is None:
+        raise ValueError("Either `year_min` must be a value or `default_min` must be True")
+    # Get ending year
+    if default_max:
+        if not col_year_end or col_year_end not in tb.columns:
+            raise ValueError(f"{col_year_end} not in table columns!")
+        year_max = tb[col_year_end].max()
+    elif year_max is None:
+        raise ValueError("Either `year_max` must be a value or `default_max` must be True")
+
+    # Cross merge with missing years
+    tb_all_years = Table(pd.RangeIndex(year_min, cast(int, year_max) + 1), columns=["year"])
+    if "year" in tb.columns:
+        raise ValueError("Column 'year' already in table! Please drop it from `tb`.")
+    tb = tb.merge(tb_all_years, how="cross")
+
+    # Only keep years that 'make sense'; both column names are required (an unset name would index tb[None])
+    if filter_times:
+        if not (col_year_start and col_year_end and {col_year_start, col_year_end} <= set(tb.columns)):
+            raise ValueError(f"Columns {col_year_start} and {col_year_end} must be in table columns!")
+        tb = tb[(tb["year"] >= tb[col_year_start]) & (tb["year"] < tb[col_year_end])]
+    return tb
diff --git a/etl/steps/data/garden/war/2023-09-21/brecke.meta.yml b/etl/steps/data/garden/war/2023-09-21/brecke.meta.yml
@@ -36,7 +36,8 @@ definitions:
     description_key: &description_key_deaths
       - |-
         {definitions.all.conflict_type_ongoing}
-      - {definitions.all.interstate_conflicts}
+      - |
+        {definitions.all.interstate_conflicts}
       - Deaths of combatants and civilians due to fighting, disease, and starvation are included.
       - For conflicts without any deaths estimate, we conversatively coded the Conflict Catalog's lower bound for including a conflict, 32 deaths each year.
 
@@ -52,7 +53,8 @@ definitions:
     description_key: &description_key_ongoing
       - |-
         {definitions.all.conflict_type_ongoing}
-      - {definitions.all.interstate_conflicts}
+      - |
+        {definitions.all.interstate_conflicts}
       - We count a conflict as ongoing in a region even if the conflict is also ongoing in other regions. The sum across all regions can therefore be higher than the total number of ongoing conflicts.
 
   number_new_conflicts:
@@ -73,7 +75,8 @@ definitions:
         <% elif conflict_type == "internal" %>
         A new internal conflict is a conflict between a state and a non-state armed groups, between non-state armed groups, or between an armed group and civilians, that causes at least 32 deaths during a year for the first time.
         <% endif %>
-      - {definitions.all.interstate_conflicts}
+      - |
+        {definitions.all.interstate_conflicts}
       - |-
         <% if (conflict_type == "interstate" or conflict_type == "internal") %>
         We count a conflict as new in a region even if the conflict started at the same time in another region. The sum across all regions can therefore be higher than the total number of new conflicts.

diff --git a/etl/steps/data/garden/war/2023-09-21/cow.meta.yml b/etl/steps/data/garden/war/2023-09-21/cow.meta.yml
@@ -175,6 +175,26 @@ dataset:
     You can find more information about the data in our article: [To be published]
 
 tables:
+
+  # COUNTRY-LEVEL
+  cow_country:
+    variables:
+      participated_in_conflict:
+        title: Participated in conflict
+        unit: ""
+        display:
+          numDecimalPlaces: 0
+        description_short: |-
+          Whether the country participated in a conflict (of a specific kind) in a given year.
+
+      number_participants:
+        title: Number of countries in conflict
+        unit: "countries"
+        display:
+          numDecimalPlaces: 0
+        description_short: |-
+          The number of countries that participated in a conflict (of a specific kind) in a given year and region.
+
   cow:
     variables:
       ##################