From 7110724431dd0ab907dcdbf8fe6affdf71125447 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 10:45:40 +0100 Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=93=8A=20war:=20ucdp=20check?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From ae2f1b6264799e0ddec1298bff4a04fb344f01b9 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 11:28:55 +0100 Subject: [PATCH 2/8] wip --- snapshots/war/2024-11-22/ucdp_ced.py | 39 +++++++++++++++++++ .../2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc | 28 +++++++++++++ .../war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc | 28 +++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 snapshots/war/2024-11-22/ucdp_ced.py create mode 100644 snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc create mode 100644 snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc diff --git a/snapshots/war/2024-11-22/ucdp_ced.py b/snapshots/war/2024-11-22/ucdp_ced.py new file mode 100644 index 00000000000..0519de5618d --- /dev/null +++ b/snapshots/war/2024-11-22/ucdp_ced.py @@ -0,0 +1,39 @@ +"""Script to create a snapshot of dataset 'UCDP Candidate Events Dataset'. + +The UCDP Candidate Events Dataset (UCDP Candidate) is based on UCDP Georeferenced Event Dataset (UCDP GED), but published at a monthly release cycle. It makes available monthly releases of candidate events data with not more than a month’s lag globally. See codebook for similarieties and differences between the two products. + +Go to https://ucdp.uu.se/downloads/index.html#candidate to find latest available versions. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. 
+SNAPSHOT_VERSION = Path(__file__).parent.name + +VERSIONS = [ + "v24_0_10", + "v24_01_24_09", +] + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + for version in VERSIONS: + snapshot_path = f"war/{SNAPSHOT_VERSION}/ucdp_ced_{version}.csv" + snap = Snapshot(snapshot_path) + snap.download_from_source() + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc b/snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc new file mode 100644 index 00000000000..a7b661f0369 --- /dev/null +++ b/snapshots/war/2024-11-22/ucdp_ced_v24_01_24_09.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Uppsala Conflict Data Program + title: Candidate Events Dataset + description: |- + The UCDP Candidate Events Dataset (UCDP Candidate) is based on UCDP Georeferenced Event Dataset (UCDP GED), but published at a monthly release cycle. It makes available monthly releases of candidate events data with not more than a month’s lag globally. See codebook for similarieties and differences between the two products. + + You can find more notes at https://ucdp.uu.se/downloads/candidateged/ucdp-candidate-codebook1.3.pdf + title_snapshot: Candidate Events Dataset (January - September 2024) + description_snapshot: |- + This is a third quarterly export, covering events from January to September 2024. + citation_full: |- + Hegre, Håvard, Mihai Croicu, Kristine Eck, and Stina Högbladh, July 2020. Introducing the UCDP Candidate Events Dataset”, Research & Politics. doi:10.1177/2053168020935257 + + Högbladh Stina, 2023, “UCDP Candidate Events Dataset Codebook, v.1.2”, Department of Peace and Conflict Research, Uppsala University. 
+ attribution_short: UCDP + version_producer: v24.01.24.09 + url_main: https://ucdp.uu.se/downloads/index.html#candidate + url_download: https://ucdp.uu.se/downloads/candidateged/GEDEvent_v24_01_24_09.csv + date_accessed: "2024-11-22" + date_published: "2024-10-20" + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: 78a92b457f82f411c973c2795fbb7286 + size: 16856392 + path: ucdp_ced_v24_01_24_09.csv diff --git a/snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc b/snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc new file mode 100644 index 00000000000..aed250710f0 --- /dev/null +++ b/snapshots/war/2024-11-22/ucdp_ced_v24_0_10.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Uppsala Conflict Data Program + title: Candidate Events Dataset + description: |- + The UCDP Candidate Events Dataset (UCDP Candidate) is based on UCDP Georeferenced Event Dataset (UCDP GED), but published at a monthly release cycle. It makes available monthly releases of candidate events data with not more than a month’s lag globally. See codebook for similarieties and differences between the two products. + + You can find more notes at https://ucdp.uu.se/downloads/candidateged/ucdp-candidate-codebook1.3.pdf + title_snapshot: Candidate Events Dataset (October 2024) + description_snapshot: |- + This is a monthly release, covering events in October 2024. + citation_full: |- + Hegre, Håvard, Mihai Croicu, Kristine Eck, and Stina Högbladh, July 2020. Introducing the UCDP Candidate Events Dataset”, Research & Politics. doi:10.1177/2053168020935257 + + Högbladh Stina, 2023, “UCDP Candidate Events Dataset Codebook, v.1.2”, Department of Peace and Conflict Research, Uppsala University. 
+ attribution_short: UCDP + version_producer: 24.0.10 + url_main: https://ucdp.uu.se/downloads/index.html#candidate + url_download: https://ucdp.uu.se/downloads/candidateged/GEDEvent_v24_0_10.csv + date_accessed: "2024-11-22" + date_published: "2024-10-20" + license: + name: CC BY 4.0 + url: https://creativecommons.org/licenses/by/4.0/ +outs: + - md5: a081ef74bc40dfa3bf79caa5406bfbe7 + size: 2268616 + path: ucdp_ced_v24_0_10.csv From 8b75c6bbb78b8deba4b888a964d35e7a85c7c793 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 11:57:14 +0100 Subject: [PATCH 3/8] wip --- dag/war.yml | 13 + .../data/garden/war/2024-11-22/shared.py | 190 +++ .../war/2024-11-22/ucdp_monthly.meta.yml | 594 +++++++ .../garden/war/2024-11-22/ucdp_monthly.py | 1358 +++++++++++++++++ .../grapher/war/2024-11-22/ucdp_monthly.py | 50 + .../data/meadow/war/2024-11-22/ucdp_ced.py | 36 + 6 files changed, 2241 insertions(+) create mode 100644 etl/steps/data/garden/war/2024-11-22/shared.py create mode 100644 etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml create mode 100644 etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py create mode 100644 etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py create mode 100644 etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py diff --git a/dag/war.yml b/dag/war.yml index b0b9b1a529a..052b00a47b2 100644 --- a/dag/war.yml +++ b/dag/war.yml @@ -59,6 +59,19 @@ steps: data://grapher/war/2024-10-02/ucdp_monthly: - data://garden/war/2024-10-02/ucdp_monthly + # UCDP (candidate data) + data://meadow/war/2024-11-22/ucdp_ced: + - snapshot://war/2024-11-22/ucdp_ced_v24_0_10.csv + - snapshot://war/2024-11-22/ucdp_ced_v24_01_24_09.csv + data://garden/war/2024-11-22/ucdp_monthly: + - data://meadow/war/2024-11-22/ucdp_ced + - data://garden/demography/2024-07-15/population + - data://garden/geography/2023-11-28/nat_earth_110 + - data://meadow/war/2024-08-26/ucdp + - data://garden/countries/2024-08-27/gleditsch + data://grapher/war/2024-11-22/ucdp_monthly: 
+ - data://garden/war/2024-11-22/ucdp_monthly + # PRIO v3.1 data://meadow/war/2023-09-21/prio_v31: - snapshot://war/2023-09-21/prio_v31.xls diff --git a/etl/steps/data/garden/war/2024-11-22/shared.py b/etl/steps/data/garden/war/2024-11-22/shared.py new file mode 100644 index 00000000000..e72e85e5b6d --- /dev/null +++ b/etl/steps/data/garden/war/2024-11-22/shared.py @@ -0,0 +1,190 @@ +from typing import List, Optional + +import numpy as np +import owid.catalog.processing as pr +from owid.catalog import Table + +from etl.data_helpers.misc import expand_time_column + + +def add_indicators_extra( + tb: Table, + tb_regions: Table, + columns_conflict_rate: Optional[List[str]] = None, + columns_conflict_mortality: Optional[List[str]] = None, +) -> Table: + """Scale original columns to obtain new indicators (conflict rate and conflict mortality indicators). + + CONFLICT RATE: + Scale columns `columns_conflict_rate` based on the number of countries (and country-pairs) in each region and year. + + For each indicator listed in `columns_to_scale`, two new columns are added to the table: + - `{indicator}_per_country`: the indicator value divided by the number of countries in the region and year. + - `{indicator}_per_country_pair`: the indicator value divided by the number of country-pairs in the region and year. + + CONFLICT MORTALITY: + Scale columns `columns_conflict_mortality` based on the population in each region. + + For each indicator listed in `columns_to_scale`, a new column is added to the table: + - `{indicator}_per_capita`: the indicator value divided by the number of countries in the region and year. + + + tb: Main table + tb_regions: Table with three columns: "year", "region", "num_countries". Gives the number of countries per region per year. + columns_to_scale: List with the names of the columns that need scaling. E.g. 
number_ongiong_conflicts -> number_ongiong_conflicts_per_country + """ + tb_regions_ = tb_regions.copy() + + # Sanity check 1: columns as expected in tb_regions + assert set(tb_regions_.columns) == { + "year", + "region", + "number_countries", + "population", + }, f"Invalid columns in tb_regions {tb_regions_.columns}" + # Sanity check 2: regions equivalent in both tables + regions_main = set(tb["region"]) + regions_aux = set(tb_regions_["region"]) + assert regions_main == regions_aux, f"Regions in main table and tb_regions differ: {regions_main} vs {regions_aux}" + + # Ensure full precision + tb_regions_["number_countries"] = tb_regions_["number_countries"].astype(float) + tb_regions_["population"] = tb_regions_["population"] # .astype(float) + # Get number of country-pairs + tb_regions_["number_country_pairs"] = ( + tb_regions_["number_countries"] * (tb_regions_["number_countries"] - 1) / 2 + ).astype(int) + + # Add number of countries and number of country pairs to main table + tb = tb.merge(tb_regions_, on=["year", "region"], how="left") + + if not columns_conflict_rate and not columns_conflict_mortality: + raise ValueError( + "Call to function is useless. Either provide `columns_conflict_rate` or `columns_conflict_mortality`." 
+ ) + + # CONFLICT RATES ########### + if columns_conflict_rate: + # Add normalised indicators + for column_name in columns_conflict_rate: + # Add per country indicator + column_name_new = f"{column_name}_per_country" + tb[column_name_new] = (tb[column_name].astype(float) / tb["number_countries"].astype(float)).replace( + [np.inf, -np.inf], np.nan + ) + # Add per country-pair indicator + column_name_new = f"{column_name}_per_country_pair" + tb[column_name_new] = (tb[column_name].astype(float) / tb["number_country_pairs"].astype(float)).replace( + [np.inf, -np.inf], np.nan + ) + + # CONFLICT MORTALITY ########### + if columns_conflict_mortality: + # Add normalised indicators + for column_name in columns_conflict_mortality: + # Add per country indicator + column_name_new = f"{column_name}_per_capita" + tb[column_name_new] = ( + (100000 * tb[column_name].astype(float) / tb["population"]) + .replace([np.inf, -np.inf], np.nan) + .astype(float) + ) + + # Drop intermediate columns + tb = tb.drop(columns=["number_countries", "number_country_pairs", "population"]) + + return tb + + +def aggregate_conflict_types( + tb: Table, + parent_name: str, + children_names: Optional[List[str]] = None, + columns_to_aggregate: Optional[List[str]] = None, + columns_to_aggregate_absolute: Optional[List[str]] = None, + columns_to_groupby: Optional[List[str]] = None, + dim_name: str = "conflict_type", +) -> Table: + """Aggregate metrics in broader conflict types.""" + if columns_to_aggregate is None: + columns_to_aggregate = ["participated_in_conflict"] + if columns_to_groupby is None: + columns_to_groupby = ["year", "country", "id"] + if columns_to_aggregate_absolute is None: + columns_to_aggregate_absolute = [] + if children_names is None: + tb_agg = tb.copy() + else: + tb_agg = tb[tb[dim_name].isin(children_names)].copy() + # Obtain summations + tb_agg = tb_agg.groupby(columns_to_groupby, as_index=False).agg({col: sum for col in columns_to_aggregate}) + # Threshold to 1 for binary columns 
+ threshold_upper = 1 + for col in columns_to_aggregate: + if col not in columns_to_aggregate_absolute: + tb_agg[col] = tb_agg[col].apply(lambda x: min(x, threshold_upper)) + # Add conflict type + tb_agg[dim_name] = parent_name + + # Combine + tb = pr.concat([tb, tb_agg], ignore_index=True) + return tb + + +def get_number_of_countries_in_conflict_by_region(tb: Table, dimension_name: str) -> Table: + """Get the number of countries participating in conflicts by region.""" + # Add region + tb_num_participants = add_region_from_code(tb) + tb_num_participants = tb_num_participants.drop(columns=["country"]).rename(columns={"region": "country"}) + + # Sanity check + assert not tb_num_participants["id"].isna().any(), "Some countries with NaNs!" + tb_num_participants = tb_num_participants.drop(columns=["id"]) + + # Groupby sum (regions) + tb_num_participants = tb_num_participants.groupby(["country", dimension_name, "year"], as_index=False)[ + "participated_in_conflict" + ].sum() + # Groupby sum (world) + tb_num_participants_world = tb_num_participants.groupby([dimension_name, "year"], as_index=False)[ + "participated_in_conflict" + ].sum() + tb_num_participants_world["country"] = "World" + # Combine + tb_num_participants = pr.concat([tb_num_participants, tb_num_participants_world], ignore_index=True) + tb_num_participants = tb_num_participants.rename(columns={"participated_in_conflict": "number_participants"}) + + # Complement with missing entries + tb_num_participants = expand_time_column( + tb_num_participants, + dimension_col=["country", dimension_name], + time_col="year", + method="full_range", + fillna_method="zero", + ) + + return tb_num_participants + + +def add_region_from_code(tb: Table, col_code: str = "id") -> Table: + """Add region to table based on code (gw, cow, isd).""" + + def _code_to_region_gw(code: int) -> str: + """Convert code to region name.""" + match code: + case c if 2 <= c <= 199: + return "Americas" + case c if 200 <= c <= 399: + return "Europe" + 
case c if 400 <= c <= 626: + return "Africa" + case c if 630 <= c <= 699: + return "Middle East" + case c if 700 <= c <= 999: + return "Asia and Oceania" + case _: + raise ValueError(f"Invalid GW code: {code}") + + tb_ = tb.copy() + tb_["region"] = tb_[col_code].apply(_code_to_region_gw) + return tb_ diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml b/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml new file mode 100644 index 00000000000..da1b751d200 --- /dev/null +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml @@ -0,0 +1,594 @@ +definitions: + common: + presentation: + topic_tags: + - War & Peace + display: + numDecimalPlaces: 0 + + all: + # Explanation of each conflict type + conflict_type_base: |- + This includes combatant and civilian deaths due to fighting + conflict_type: |- + <%- if conflict_type == "all" -%> + An armed conflict is a disagreement between organized groups, or between one organized group and civilians, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "state-based" -%> + A state-based conflict is a conflict between two armed groups, at least one of which is a state, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "interstate" -%> + An interstate conflict is a conflict between states that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "intrastate" -%> + An intrastate conflict is a conflict between a state and a non-state armed group that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. If a foreign state is involved, it is called "internationalized", and "non-internationalized" otherwise. 
+ + <%- elif conflict_type == "intrastate (internationalized)" -%> + An internationalized intrastate conflict is a conflict between a state and a non-state armed group, with involvement of a foreign state, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + A non-internationalized intrastate conflict is a conflict between a state and a non-state armed group, without involvement of a foreign state, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "extrasystemic" -%> + An extrasystemic conflict is a conflict between a state and a non-state armed group outside its territory that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "non-state conflict" -%> + A non-state conflict is a conflict between non-state armed groups, such as rebel groups, criminal organizations, or ethnic groups, that causes at least 25 deaths during a year. {definitions.all.conflict_type_base}. + + <%- elif conflict_type == "one-sided violence" -%> + One-sided violence is the use of armed force by a state or non-state armed group against civilians that causes at least 25 civilian deaths during a year. + + <%- endif -%> + location_conflicts_method: |- + UCDP provides geographical coordinates of each conflict event. We have mapped these coordinates to countries by means of the Natural Earth dataset. + + In some instances, the event's coordinates fall within the borders of a country. Other times, the event's coordinates fall outside the borders of a country. In the latter case, we have mapped the event to the country that is closest to the event's coordinates. + + Conflict event with id "53238" and relid "PAK-2003-1-345-88" was assigned to "Siachen Glacier" by Natural Earth. 
We have mapped it to "Pakistan" following the text in the `where_description` field from the Natural Earth data, which refers to "Giang sector in Siachen, Pakistani Kashmir". + + # Fields used for number of deaths indicators + number_deaths: + description_short: |- + <%- if conflict_type == "all" -%> + The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "state-based" -%> + The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in interstate, intrastate, and extrasystemic conflicts that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in internationalized intrastate conflicts that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in non-internationalized intrastate conflicts that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "one-sided violence" -%> + The << estimate >> estimate of the number of deaths of civilians from one-sided violence that was ongoing that year<< per_capita >>. + + <%- elif conflict_type == "non-state conflict" -%> + The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in non-state conflicts that were ongoing that year<< per_capita >>. + + <%- else -%> + The << estimate >> estimate of the number of deaths of combatants and civilians due to fighting in << conflict_type >> conflicts that were ongoing that year<< per_capita >>. 
+ + <%- endif -%> + description_short_per_capita: <% set per_capita = ", per 100,000 people" %> + {definitions.number_deaths.description_short} + description_key: &description_key_deaths + - "{definitions.all.conflict_type}" + + number_deaths_type: + description_short: |- + <%- if conflict_type == "all" -%> + The best estimate of the number of deaths of << people_type >> in interstate, intrastate, extrasystemic, non-state conflicts, and one-sided violence that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "state-based" -%> + The best estimate of the number of deaths of << people_type >> in interstate, intrastate, and extrasystemic conflicts that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + The best estimate of the number of deaths of << people_type >> in internationalized intrastate conflicts that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + The best estimate of the number of deaths of << people_type >> in non-internationalized intrastate conflicts that were ongoing that year<< per_capita >>. + + <%- elif conflict_type == "one-sided violence" -%> + The best estimate of the number of deaths of << people_type >> from one-sided violence that was ongoing that year<< per_capita >>. + + <%- elif conflict_type == "non-state conflict" -%> + The best estimate of the number of deaths of << people_type >> in non-state conflicts that were ongoing that year<< per_capita >>. + + <%- else -%> + The best estimate of the number of deaths of << people_type >> in << conflict_type >> conflicts that were ongoing that year<< per_capita >>. 
+ + <%- endif -%> + description_short_per_capita: <% set per_capita = ", per 100,000 people" %> + {definitions.number_deaths_type.description_short} + description_key: &description_key_deaths_type + - "{definitions.all.conflict_type}" + + number_ongoing_conflicts: + description_short: |- + <%- if conflict_type == "all" -%> + Included are armed conflicts that were ongoing that year. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + Included are internationalized intrastate conflicts that were ongoing that year. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + Included are non-internationalized intrastate conflicts that were ongoing that year. + + <%- elif conflict_type == "non-state conflict" -%> + Included are non-state conflicts that were ongoing that year. + + <%- elif conflict_type == "one-sided violence" -%> + Included is one-sided violence that was ongoing that year. + + <%- elif conflict_type == "state-based" -%> + Included are interstate, intrastate, and extrasystemic conflicts that were ongoing that year. + + <%- else -%> + Included are << conflict_type >> conflicts that were ongoing that year. + + <%- endif -%> + description_key: &description_key_ongoing + - "{definitions.all.conflict_type}" + - We count a conflict as ongoing in a region even if the conflict is also ongoing in other regions. The sum across all regions can therefore be higher than the total number of ongoing conflicts. + + number_new_conflicts: + description_short: "{definitions.number_ongoing_conflicts.description_short}" + description_key: &description_key_new + - "{definitions.all.conflict_type}" + - We only count a conflict as new when the conflict overall started that year, not if it became active again. + - We count a conflict as new in a region even if the conflict started earlier or at the same time in another region. The sum across all regions can therefore be higher than the total number of new conflicts. 
+ - |- + <%- if conflict_type == "intrastate (internationalized)" -%> + We count an internationalized intrastate conflict as new only if the conflict started that year, not if it became internationalized. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + We count a non-internationalized intrastate conflict as new only if the conflict started that year, not if it stopped being internationalized. + <%- endif -%> + +tables: + # PARTICIPANT INDICATORS + ucdp_monthly_country: + common: + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + description_key: + - "{definitions.all.conflict_type}" + - A country is considered to participate in a conflict if they were a primary participant, which refers to those participants that have the main disagreement of the conflict. + + variables: + participated_in_conflict: + title: State involved in conflict + unit: "" + description_short: |- + <%- if conflict_type == "state-based" -%> + State was a primary participant in at least one interstate, intrastate, or extrasystemic conflict that year. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + State was a primary participant in at least one internationalized intrastate conflict that year. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + State was a primary participant in at least one non-internationalized intrastate conflict that year. + + <%- elif conflict_type == "one-sided violence" -%> + State was a primary participant in at least one instance of one-sided violence that year. + + <%- else -%> + State was a primary participant in at least one << conflict_type >> conflict that year. + + <%- endif -%> + description_key: + - |- + '1' indicates that the state participated in a conflict. '0' indicates that the state did not participate in a conflict. 
+ + number_participants: + title: Number of states involved in conflicts + unit: "states" + description_short: |- + <%- if conflict_type == "state-based" -%> + Included are states that were primary participants in at least one interstate, intrastate, or extrasystemic conflict that year. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + Included are states that were primary participants in at least one internationalized intrastate conflict that year. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + Included are states that were primary participants in at least one non-internationalized intrastate conflict that year. + + <%- elif conflict_type == "one-sided violence" -%> + Included are states that were primary participants in at least one instance of one-sided violence that year. + + <%- else -%> + Included are states that were primary participants in at least one << conflict_type >> conflict that year. + + <%- endif -%> + + # LOCATION INDICATORS + ucdp_monthly_locations: + common: + description_processing: |- + {definitions.all.location_conflicts_method} + description_key: + - "{definitions.all.conflict_type}" + + variables: + is_location_of_conflict: + title: Country where conflict took place + unit: "" + description_short: |- + <%- if conflict_type == "state-based" -%> + At least one interstate, intrastate, or extrasystemic conflict event took place in this country in a given year. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + At least one internationalized intrastate conflict event took place in this country in a given year. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + At least one non-internationalized intrastate conflict event took place in this country in a given year. + + <%- elif conflict_type == "one-sided violence" -%> + At least one conflict event took place in this country in a given year. 
+ + <%- else -%> + At least one << conflict_type >> conflict event took place in this country in a given year. + + <%- endif -%> + description_key: + - |- + '1' indicates that there was a conflict event in the given country. '0' indicates that there was no conflict event in the given country. + - "{definitions.all.conflict_type}" + + number_locations: + title: Number of countries where conflict took place + unit: "countries" + description_short: |- + <%- if conflict_type == "all" -%> + Included are armed conflicts that caused at least one death in the country that year. + + <%- elif conflict_type == "state-based" -%> + Included are interstate, intrastate, and extrasystemic conflicts that caused at least one death in the country that year. + + <%- elif conflict_type == "intrastate (internationalized)" -%> + Included are internationalized intrastate conflicts that caused at least one death in the country that year. + + <%- elif conflict_type == "intrastate (non-internationalized)" -%> + Included are non-internationalized intrastate conflicts that caused at least one death in the country that year. + + <%- elif conflict_type == "one-sided violence" -%> + Included is one-sided violence that caused at least one death in the country that year. + + <%- else -%> + Included are << conflict_type >> conflicts that caused at least one death in the country that year. 
+ + <%- endif -%> + + number_deaths: + title: Deaths in ongoing conflicts in a country (best estimate) + unit: "deaths" + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short} + + number_deaths_low: + title: Deaths in ongoing conflicts in a country (low estimate) + unit: "deaths" + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short} + + number_deaths_high: + title: Deaths in ongoing conflicts in a country (high estimate) + unit: "deaths" + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short} + + number_deaths_combatants: + title: Deaths of combatants in ongoing conflicts in a country + unit: "deaths" + description_short: |- + <% set people_type = "combatants" %> + {definitions.number_deaths_type.description_short} + + number_deaths_civilians: + title: Deaths of civilians in ongoing conflicts in a country + unit: "deaths" + description_short: |- + <% set people_type = "civilians" %> + {definitions.number_deaths_type.description_short} + + number_deaths_unknown: + title: Deaths of unknown type in ongoing conflicts in a country + unit: "deaths" + description_short: |- + <% set people_type = "unknown type" %> + {definitions.number_deaths_type.description_short} + + death_rate: + title: Death rate in ongoing conflicts in a country (best estimate) + unit: "deaths per 100,000 people" + display: + numDecimalPlaces: 1 + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short_per_capita} + + death_rate_low: + title: Death rate in ongoing conflicts in a country (low estimate) + unit: "deaths per 100,000 people" + display: + numDecimalPlaces: 1 + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short_per_capita} + + death_rate_high: + title: Death rate in ongoing conflicts in a country (high estimate) + unit: "deaths per 100,000 people" + display: + 
numDecimalPlaces: 1 + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short_per_capita} + + # death_rate_combatants: + # title: Death rate of combatants in ongoing conflicts in a country + # unit: "deaths per 100,000 people" + # display: + # numDecimalPlaces: 1 + # description_short: |- + # <% set people_type = "combatants" %> + # {definitions.number_deaths_type.description_short_per_capita} + + # death_rate_civilians: + # title: Death rate of civilians in ongoing conflicts in a country + # unit: "deaths per 100,000 people" + # display: + # numDecimalPlaces: 1 + # description_short: |- + # <% set people_type = "civilians" %> + # {definitions.number_deaths_type.description_short_per_capita} + + # death_rate_unknown: + # title: Death rate of unknown type in ongoing conflicts in a country + # unit: "deaths per 100,000 people" + # display: + # numDecimalPlaces: 1 + # description_short: |- + # <% set people_type = "unknown type" %> + # {definitions.number_deaths_type.description_short_per_capita} + + # MAIN INDICATORS + ucdp_monthly: + common: + presentation: + grapher_config: + selectedEntityNames: + - Africa + - Americas + - Asia and Oceania + - Europe + - Middle East + variables: + ################## + # Ongoing deaths # + ################## + ## Estimated deaths + number_deaths_ongoing_conflicts: + title: Deaths in ongoing conflicts (best estimate) + unit: deaths + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short} + description_key: *description_key_deaths + + number_deaths_ongoing_conflicts_high: + title: Deaths in ongoing conflicts (high estimate) + unit: deaths + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short} + description_key: *description_key_deaths + + number_deaths_ongoing_conflicts_low: + title: Deaths in ongoing conflicts (low estimate) + unit: deaths + description_short: |- + <% set estimate = "low" %> + 
{definitions.number_deaths.description_short} + description_key: *description_key_deaths + + ## Deaths by type + number_deaths_ongoing_conflicts_civilians: + title: Deaths of civilians in ongoing conflicts + unit: deaths + description_short: |- + <% set people_type = "civilians" %> + {definitions.number_deaths_type.description_short} + description_key: *description_key_deaths_type + + number_deaths_ongoing_conflicts_combatants: + title: Deaths of combatants in ongoing conflicts + unit: deaths + description_short: |- + <% set people_type = "combatants" %> + {definitions.number_deaths_type.description_short} + description_key: *description_key_deaths_type + + number_deaths_ongoing_conflicts_unknown: + title: Deaths of unknown type in ongoing conflicts + unit: deaths + description_short: |- + <% set people_type = "unknown type" %> + {definitions.number_deaths_type.description_short} + description_key: *description_key_deaths_type + + ## Deaths per capita + number_deaths_ongoing_conflicts_per_capita: + title: Death rate in ongoing conflicts (best estimate) + unit: deaths per 100,000 people + description_short: |- + <% set estimate = "best" %> + {definitions.number_deaths.description_short_per_capita} + description_key: *description_key_deaths + display: + numDecimalPlaces: 1 + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + number_deaths_ongoing_conflicts_high_per_capita: + title: Death rate in ongoing conflicts (high estimate) + unit: deaths per 100,000 people + description_short: |- + <% set estimate = "high" %> + {definitions.number_deaths.description_short_per_capita} + description_key: *description_key_deaths + display: + numDecimalPlaces: 1 + + number_deaths_ongoing_conflicts_low_per_capita: + title: Death rate in ongoing conflicts (low estimate) + unit: deaths per 100,000 people + description_short: |- + <% set estimate = "low" %> + {definitions.number_deaths.description_short_per_capita} + description_key: 
*description_key_deaths + display: + numDecimalPlaces: 1 + + # number_deaths_ongoing_conflicts_civilians_per_capita: + # title: Death rate from civilians in ongoing conflicts + # unit: deaths + # description_short: |- + # <% set people_type = "civilians" %> + # {definitions.number_deaths_type.description_short_per_capita} + # description_key: *description_key_deaths_type + # display: + # numDecimalPlaces: 1 + + # number_deaths_ongoing_conflicts_combatants_per_capita: + # title: Death rate from combatants ongoing conflicts + # unit: deaths + # description_short: |- + # <% set people_type = "combatants" %> + # {definitions.number_deaths_type.description_short_per_capita} + # description_key: *description_key_deaths_type + # display: + # numDecimalPlaces: 1 + + + # number_deaths_ongoing_conflicts_unknown_per_capita: + # title: Death rate from unknown type in ongoing conflicts + # unit: deaths + # description_short: |- + # <% set people_type = "unknown type" %> + # {definitions.number_deaths_type.description_short_per_capita} + # description_key: *description_key_deaths + # display: + # numDecimalPlaces: 1 + + + ##################### + # Ongoing conflicts # + ##################### + number_ongoing_conflicts: + title: Number of ongoing conflicts + unit: conflicts + description_short: |- + {definitions.number_ongoing_conflicts.description_short} + description_key: *description_key_ongoing + presentation: + grapher_config: + selectedEntityNames: + - World + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + number_ongoing_conflicts_per_country: + title: Number of ongoing conflicts per state + unit: conflicts per state + description_short: |- + The number of conflicts divided by the number of all states. This accounts for the changing number of states over time.
{definitions.number_ongoing_conflicts.description_short} + description_key: *description_key_ongoing + display: + numDecimalPlaces: 3 + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + number_ongoing_conflicts_per_country_pair: + title: Number of ongoing conflicts per state-pair + unit: conflicts per state-pair + description_short: |- + The number of conflicts divided by the number of all state-pairs. This accounts for the changing number of states over time. {definitions.number_ongoing_conflicts.description_short} + description_key: *description_key_ongoing + display: + numDecimalPlaces: 5 + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + ################# + # New conflicts # + ################# + number_new_conflicts: + title: Number of new conflicts + unit: conflicts + description_short: |- + {definitions.number_new_conflicts.description_short} + description_key: *description_key_new + presentation: # TODO + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + number_new_conflicts_per_country: + title: Number of new conflicts per state + unit: conflicts per state + description_short: |- + The number of conflicts divided by the number of all states. This accounts for the changing number of states over time. {definitions.number_new_conflicts.description_short} + description_key: *description_key_new + display: + numDecimalPlaces: 3 + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + + number_new_conflicts_per_country_pair: + title: Number of new conflicts per state-pair + unit: conflicts per state-pair + description_short: |- + The number of conflicts divided by the number of all state-pairs. This accounts for the changing number of states over time. 
{definitions.number_new_conflicts.description_short} + description_key: *description_key_new + display: + numDecimalPlaces: 5 + presentation: + attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) + +dataset: + title: UCDP, History of war (monthly) + description: |- + This dataset provides information on armed conflicts, using data from the UCDP Georeferenced Event Dataset (version 23.1), the UCDP/PRIO Armed Conflict Dataset (version 23.1), and the UCDP Battle-Related Deaths Dataset (version 23.1). + + We aggregate the UCDP Georeferenced Event Dataset up to the year and world (region) to identify all conflict deaths, non-state conflicts, and one-sided violence. + + We use the UCDP/PRIO Armed Conflict Dataset to identify state-based conflicts: interstate, intrastate (all, internationalized, and non-internationalized), and extrasystemic. + + We use the UCDP Battle-Related Deaths Dataset to link deaths in the Georeferenced Event Dataset to types of state-based conflicts in the UCDP/PRIO Armed Conflict Dataset. + + We combine these datasets to provide information on the number of ongoing and new conflicts, the number of ongoing and new conflict types, as well as the number of deaths in ongoing conflicts and conflict types. + + Deaths of combatants and civilians due to fighting are included. + + The Georeferenced Event Dataset has been extracted from the UCDP systems at a certain point in time. However, the UCDP team works with the data all year round, including revisions and updates. Therefore, their dashboard + might show slightly more up-to-date data, which sometimes result in minor discrepancies in the data. + + We use the world regions as defined by UCDP/PRIO: Africa, Americas, Asia, Europe, and Middle East. These are defined based on Gleditsch and Ward codes. 
Find the complete mapping at + http://ksgleditsch.com/data/iisystem.dat (states) and http://ksgleditsch.com/data/microstatessystem.dat (micro-states): + + • Americas: 2-199 + + • Europe: 200-399 + + • Africa: 400-626 + + • Middle East: 630-699 + + • Asia and Oceania: 700-999 + + You can find more information about the data in our article: [To be published] + + This dataset contains information on armed conflicts - state, non-state and one-sided conflicts, between 1989 and 2022. diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py b/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py new file mode 100644 index 00000000000..2cad268e730 --- /dev/null +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py @@ -0,0 +1,1358 @@ +"""Data from UCDP. + + +Notes: + - Conflict types for state-based violence are sourced from the UCDP/PRIO dataset. Non-state and one-sided violence are sourced from the GED dataset. + - There can be some mismatches with the latest officially reported data (UCDP's live dashboard). This is because UCDP uses the latest data for their dashboard, which might not be available yet as bulk download. + - Regions: + - Uses `region` column for both GED and UCDP/PRIO datasets. + - Incompatibilities in Oceania are encoded in "Asia". We therefore have changed the region name to "Asia and Oceania". + - GED: Dataset uses names (not codes!) + - You can learn more about the countries included in each region from section "Appendix 5 Main sources consulted during the 2022 update" in page 40, + document: https://ucdp.uu.se/downloads/ged/ged231.pdf. + - Note that countries from Oceania are included in Asia!
+ - UCDP/PRIO: Dataset uses codes (note we changed "Asia" -> "Asia and Oceania") + 1 = Europe (GWNo: 200-399) + 2 = Middle East (GWNo: 630-699) + 3 = Asia (GWNo: 700-999) [renamed to 'Asia and Oceania'] + 4 = Africa (GWNo: 400-626) + 5 = Americas (GWNo: 2-199) +""" + +from datetime import datetime +from typing import List, Optional + +import geopandas as gpd +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Table +from owid.catalog import processing as pr +from shapely import wkt +from shared import ( + add_indicators_extra, + aggregate_conflict_types, + get_number_of_countries_in_conflict_by_region, +) +from structlog import get_logger + +from etl.data_helpers import geo +from etl.data_helpers.misc import expand_time_column +from etl.helpers import PathFinder, create_dataset + +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Mapping for Geo-referenced datase +TYPE_OF_VIOLENCE_MAPPING = { + 2: "non-state conflict", + 3: "one-sided violence", +} +# Mapping for armed conflicts dataset (inc PRIO/UCDP) +TYPE_OF_CONFLICT_MAPPING = { + 1: "extrasystemic", + 2: "interstate", + 3: "intrastate (non-internationalized)", + 4: "intrastate (internationalized)", +} +# Regions mapping (for PRIO/UCDP dataset) +REGIONS_MAPPING = { + 1: "Europe", + 2: "Middle East", + 3: "Asia and Oceania", + 4: "Africa", + 5: "Americas", +} +REGIONS_EXPECTED = set(REGIONS_MAPPING.values()) +# Last year of data +LAST_YEAR_STABLE = 2023 +LAST_YEAR_CED = 2024 +LAST_YEAR = 2023 + + +def run(dest_dir: str) -> None: + paths.log.info("start") + + # + # Load inputs. + # + # Load meadow dataset. 
def run(dest_dir: str) -> None:
    """Build the garden dataset from UCDP (stable) and UCDP CED (candidate events) data.

    Loads the input datasets, appends the candidate events (CED) to the georeferenced
    events (GED), derives region-level and country-level conflict indicators and saves
    the resulting tables as a new garden dataset.

    Parameters
    ----------
    dest_dir : str
        Destination directory, passed on to `create_dataset`.
    """
    paths.log.info("start")

    # Death-count columns (shared by GED and CED) that are cast to float.
    dtypes_deaths = {
        "deaths_a": float,
        "deaths_b": float,
        "deaths_civilians": float,
        "deaths_unknown": float,
        "best": float,
        "high": float,
        "low": float,
    }

    #
    # Load inputs.
    #
    # Load meadow dataset.
    ds_meadow = paths.load_dataset("ucdp")
    ds_ced = paths.load_dataset("ucdp_ced")

    # Read table from GW codes
    ds_gw = paths.load_dataset("gleditsch")
    tb_regions = ds_gw.read("gleditsch_regions")
    tb_codes = ds_gw["gleditsch_countries"]

    # Load maps table
    short_name = "nat_earth_110"
    ds_maps = paths.load_dataset(short_name)
    tb_maps = ds_maps.read(short_name)

    # Load population
    ds_population = paths.load_dataset("population")

    #
    # Process data.
    #
    paths.log.info("sanity checks")
    _sanity_checks(ds_meadow)

    # Load relevant tables
    # NOTE(review): `tb_prio` below is read without `.reset_index()` and is then used by
    # column name. Confirm whether `read()` already returns a flat index; if so, the extra
    # `.reset_index()` calls here only add an `index` column.
    tb_ged = ds_meadow.read("ucdp_ged").reset_index().astype(dtypes_deaths)
    tb_ced = ds_ced.read("ucdp_ced").reset_index().astype(dtypes_deaths)
    tb_conflict = (
        ds_meadow.read("ucdp_battle_related_conflict")
        .reset_index()
        .astype(
            {
                "bd_best": float,
                "bd_low": float,
                "bd_high": float,
            }
        )
    )
    tb_prio = ds_meadow.read("ucdp_prio_armed_conflict")

    # Extend codes to have data for latest years
    tb_codes = extend_latest_years(tb_codes)

    # Merge CED into GED
    # Compare column *sets*: the order is aligned right below, and a positional comparison
    # would raise a ValueError (instead of this assertion) if the number of columns differed.
    assert set(tb_ced.columns) == set(tb_ged.columns), "Columns are not the same!"
    assert tb_ged["year"].max() == LAST_YEAR_STABLE, "GED data is not up to date!"
    assert tb_ced["year"].max() == LAST_YEAR_CED, "CED data is not up to date!"
    tb_ced = tb_ced[tb_ged.columns]
    tb_ged = pr.concat([tb_ged, tb_ced], ignore_index=True)

    # Keep only active conflicts
    paths.log.info("keep active conflicts")
    tb_ged = tb_ged.loc[tb_ged["active_year"] == 1]

    # Change region named "Asia" to "Asia and Oceania" (in GED)
    tb_ged["region"] = tb_ged["region"].replace({"Asia": "Asia and Oceania"})

    # Create `conflict_type` column
    paths.log.info("add field `conflict_type`")
    tb = add_conflict_type(tb_ged, tb_conflict)

    # Get country-level stuff
    paths.log.info("getting country-level indicators")
    tb_participants = estimate_metrics_participants(tb, tb_prio, tb_codes)
    tb_locations = estimate_metrics_locations(tb, tb_maps, tb_codes, ds_population)

    # Sanity check conflict_type transitions
    ## Only consider transitions between intrastate and intl intrastate. If other transitions are detected, raise error.
    _sanity_check_conflict_types(tb)
    _sanity_check_prio_conflict_types(tb_prio)

    # Add number of new conflicts and ongoing conflicts (also adds data for the World)
    paths.log.info("get metrics for main dataset (also estimate values for 'World')")
    tb = estimate_metrics(tb)

    # Add table from UCDP/PRIO
    paths.log.info("prepare data from ucdp/prio table (also estimate values for 'World')")
    tb_prio = prepare_prio_data(tb_prio)

    # Fill NaNs
    paths.log.info("replace missing data with zeros (where applicable)")
    tb_prio = expand_time_column(
        tb_prio,
        dimension_col=["region", "conflict_type"],
        time_col="year",
        method="full_range",
        fillna_method="zero",
    )
    tb = expand_time_column(
        tb,
        dimension_col=["region", "conflict_type"],
        time_col="year",
        method="full_range",
        fillna_method="zero",
    )
    # Combine main dataset with PRIO/UCDP
    paths.log.info("add data from ucdp/prio table")
    tb = combine_tables(tb, tb_prio)

    # Add extra-systemic after 1989
    paths.log.info("fix extra-systemic nulls")
    tb = fix_extrasystemic_entries(tb)

    # Add data for "all conflicts" conflict type
    paths.log.info("add data for 'all conflicts'")
    tb = add_conflict_all(tb)

    # Add data for "all intrastate" conflict types
    tb = add_conflict_all_intrastate(tb)

    # Add data for "state-based" conflict types
    tb = add_conflict_all_statebased(tb)

    # Force types
    # tb = tb.astype({"conflict_type": "category", "region": "category"})

    # Add conflict rates
    tb = add_indicators_extra(
        tb,
        tb_regions,
        columns_conflict_rate=["number_ongoing_conflicts", "number_new_conflicts"],
        columns_conflict_mortality=[
            "number_deaths_ongoing_conflicts",
            "number_deaths_ongoing_conflicts_high",
            "number_deaths_ongoing_conflicts_low",
            # "number_deaths_ongoing_conflicts_civilians",
            # "number_deaths_ongoing_conflicts_unknown",
            # "number_deaths_ongoing_conflicts_combatants",
        ],
    )

    # Adapt region names
    tb = adapt_region_names(tb)

    # Tables
    tables = [
        tb.format(["year", "region", "conflict_type"], short_name=paths.short_name),
        tb_participants.format(["year", "country", "conflict_type"]),
        tb_locations.format(["year", "country", "conflict_type"]),
    ]

    #
    # Save outputs.
    #
    # Create a new garden dataset with the same metadata as the meadow dataset.
    ds_garden = create_dataset(
        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata
    )

    # Save changes in the new garden dataset.
    ds_garden.save()

    paths.log.info("ucdp.end")
def _sanity_checks(ds: Dataset) -> None:
    """Cross-check the GED table against the conflict-level tables of the same dataset.

    For each type of violence (1: battle-related, 2: non-state, 3: one-sided), the conflict
    IDs and the yearly best-estimate deaths found in `ucdp_ged` are compared with the
    corresponding conflict-level table. An AssertionError is raised on mismatch.
    """

    def _check_consistency_of_ged(
        tb_ged: Table,
        tb_type: Table,
        death_col: str,
        type_of_violence: int,
        conflict_ids_errors: Optional[List[int]] = None,
    ):
        # Largest tolerated relative error between both death counts.
        ERR_THRESHOLD = 0.015

        is_type = tb_ged["type_of_violence"] == type_of_violence
        is_active = tb_ged["active_year"] == 1

        # 1) Conflict IDs: after an outer merge, any NaN means an ID is missing on one side.
        ids_ged = tb_ged.loc[is_type, ["conflict_new_id"]].drop_duplicates()
        ids_type = tb_type[["conflict_id"]].drop_duplicates()
        ids = ids_ged.merge(ids_type, left_on="conflict_new_id", right_on="conflict_id", how="outer")
        assert ids.isna().sum().sum() == 0, "Check NaNs in conflict_new_id or conflict_id"

        # 2) Deaths per conflict and year (GED: active years only).
        deaths_ged = tb_ged.loc[is_type & is_active].groupby(["conflict_new_id", "year"], as_index=False)[["best"]].sum()
        deaths_ged = deaths_ged.sort_values(["conflict_new_id", "year"])
        deaths_type = tb_type[["conflict_id", "year", death_col]].sort_values(["conflict_id", "year"])
        res = deaths_ged.merge(
            deaths_type,
            left_on=["conflict_new_id", "year"],
            right_on=["conflict_id", "year"],
            how="outer",
        )

        # Relative error, using the GED figure as reference.
        # NOTE(review): the error is signed, so only cases where GED exceeds the reference
        # table by more than the threshold are flagged -- confirm this is intended.
        res["err"] = res["best"].astype(float) - res[death_col].astype(float)
        res["err_rel"] = res["err"] / res["best"]
        res = res[res["err_rel"] > ERR_THRESHOLD]

        # Ignore conflicts with known (accepted) discrepancies.
        if conflict_ids_errors is not None:
            res = res.loc[~res["conflict_new_id"].isin(conflict_ids_errors)]
        assert (
            len(res) == 0
        ), f"Dicrepancy between number of deaths in conflict ({tb_ged.m.short_name} vs. {tb_type.m.short_name}). \n {res})"

    # (table name, deaths column, type of violence, conflict IDs with accepted discrepancies)
    checks = [
        ("ucdp_battle_related_conflict", "bd_best", 1, None),  # Battle-related conflict
        ("ucdp_non_state", "best_fatality_estimate", 2, [16009]),  # Non-state
        ("ucdp_one_sided", "best_fatality_estimate", 3, [16009]),  # One-sided
    ]

    # Read all tables up-front, then run the checks in order.
    tb_ged = ds["ucdp_ged"].reset_index()
    tables = {table_name: ds[table_name].reset_index() for table_name, _, _, _ in checks}

    for table_name, death_col, type_of_violence, accepted_ids in checks:
        _check_consistency_of_ged(tb_ged, tables[table_name], death_col, type_of_violence, accepted_ids)
def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table:
    """Add column `conflict_type` to the georeferenced events table.

    `tb_ged` only carries a coarse classification (`type_of_violence`), in which all
    state-based events share the same code (1). The finer classification of state-based
    conflicts is taken from `tb_conflict` (`type_of_conflict`).

    Resulting values of `conflict_type`:
        - the labels in `TYPE_OF_VIOLENCE_MAPPING` (non-state conflict, one-sided violence),
        - the labels in `TYPE_OF_CONFLICT_MAPPING` (extrasystemic, interstate and both kinds of intrastate),
        - "state-based (unknown)" for state-based events without a match in `tb_conflict`.

    Parameters
    ----------
    tb_ged: Table
        Main table, with the event data.
    tb_conflict: Table
        Secondary table, providing `type_of_conflict` per conflict and year.

    Returns
    -------
    Table
        Outer merge of both tables, with the additional column `conflict_type`.
    """
    # One row per conflict and year, with its type.
    conflict_types = tb_conflict[["conflict_id", "year", "type_of_conflict"]].drop_duplicates()
    pairs_count = conflict_types.groupby(["conflict_id", "year"]).size()
    assert pairs_count.max() == 1, "Some conflict_id-year pairs are duplicated!"

    # Bring `type_of_conflict` (code of the state-based conflict type, see TYPE_OF_CONFLICT_MAPPING) into the events.
    tb_ged = tb_ged.merge(
        conflict_types,
        left_on=["conflict_new_id", "year"],
        right_on=["conflict_id", "year"],
        how="outer",
    )

    # State-based events without a known type get an explicit placeholder.
    is_state_based = tb_ged["type_of_violence"] == 1
    tb_ged["type_of_conflict"] = tb_ged["type_of_conflict"].astype(object)
    state_based_types = tb_ged.loc[is_state_based, "type_of_conflict"]
    tb_ged.loc[is_state_based, "type_of_conflict"] = state_based_types.fillna("state-based (unknown)")

    # `type_of_conflict` must be missing for all other events...
    types_other = tb_ged[tb_ged["type_of_violence"] != 1]["type_of_conflict"]
    assert (
        types_other.isna().all()
    ), "There are some actual values for non-state based conflicts! These should only be NaN, since `tb_conflict` should only contain data for state-based conflicts."
    # ...and present for all state-based events.
    types_state_based = tb_ged[tb_ged["type_of_violence"] == 1]["type_of_conflict"]
    assert (
        not types_state_based.isna().any()
    ), "Could not find the type of conflict for some state-based conflicts!"

    # Label from `type_of_conflict` where available, otherwise from `type_of_violence`.
    labels_from_conflict = tb_ged["type_of_conflict"].astype(object).replace(TYPE_OF_CONFLICT_MAPPING)
    labels_from_violence = tb_ged["type_of_violence"].astype(object).replace(TYPE_OF_VIOLENCE_MAPPING)
    tb_ged["conflict_type"] = labels_from_conflict.fillna(labels_from_violence)

    # Sanity check
    assert tb_ged["conflict_type"].isna().sum() == 0, "Check NaNs in conflict_type (i.e. conflicts without a type)!"
    return tb_ged
def _sanity_check_conflict_types(tb: Table) -> None:
    """Check the conflict types assigned to each conflict.

    - The only accepted change of type over a conflict's lifetime is between the two intrastate types.
    - A conflict is expected to have only one type in a given year.

    Only years before `LAST_YEAR_STABLE` are considered.

    Raises
    ------
    AssertionError
        If any of the checks above fails.
    """
    # Only combination of conflict types accepted for a single conflict.
    TRANSITION_EXPECTED = {"intrastate (internationalized)", "intrastate (non-internationalized)"}

    tb_ = tb.loc[tb["year"] < LAST_YEAR_STABLE]

    # Distinct combinations of types, among conflicts with more than one type over their lifetime.
    types_per_conflict = tb_.groupby("conflict_new_id")["conflict_type"].apply(set)
    transitions = {frozenset(types) for types in types_per_conflict if len(types) > 1}
    assert transitions == {
        frozenset(TRANSITION_EXPECTED)
    }, f"Unexpected conflict type transitions: {[set(t) for t in transitions]}"

    # Check if different regions categorise the conflict differently in the same year
    assert not (
        tb_.groupby(["conflict_id", "year"])["type_of_conflict"].nunique() > 1
    ).any(), "Seems like the conflict has multiple types for a single year! Is it categorised differently depending on the region? This case has not been taken into account -- please review the code!"
Typically, only in the intrastate domain + TRANSITION_EXPECTED = {"intrastate (internationalized)", "intrastate (non-internationalized)"} + # Get conflicts with more than one conflict type assigned to them over their lifetime + tb_ = tb.loc[tb["year"] < LAST_YEAR_STABLE] + conflict_type_transitions = tb_.groupby("conflict_new_id")["conflict_type"].apply(set) + transitions = conflict_type_transitions[conflict_type_transitions.apply(len) > 1].drop_duplicates() + # Extract unique combinations of conflict_types for a conflict + assert (len(transitions) == 1) & (transitions.iloc[0] == TRANSITION_EXPECTED), "Error" + + # Check if different regions categorise the conflict differently in the same year + assert not ( + tb_.groupby(["conflict_id", "year"])["type_of_conflict"].nunique() > 1 + ).any(), "Seems like the conflict has multiple types for a single year! Is it categorised differently depending on the region? This case has not been taken into account -- please review the code!" + + +def _sanity_check_prio_conflict_types(tb: Table) -> Table: + """Check conflict type in UCDP/PRIO data. + + - Only transitions accepted between intrastate conflicts. + - The same conflict is only expceted to have one type in a year. + """ + # Define expected combinations of conflict_types for a conflict. 
def estimate_metrics(tb: Table) -> Table:
    """Estimate the number of ongoing conflicts, new conflicts and deaths.

    Values for 'World' are estimated here as well. They can't be derived later from the
    regional values: a conflict may occur in several regions and would be counted more than
    once. Counting unique conflicts requires the conflict ID, which is only available here.
    """
    keys = ["year", "region", "conflict_type"]

    # Ongoing conflicts, and deaths in ongoing conflicts
    paths.log.info("get number of ongoing conflicts and deaths in ongoing conflicts")
    ongoing = _get_ongoing_metrics(tb)

    # New conflicts
    paths.log.info("get number of new conflicts every year")
    new = _get_new_metrics(tb)

    # Single table with both. Outer merge: some (year, region, conflict_type) combinations
    # exist for ongoing conflicts but not for new conflicts, and must be kept.
    paths.log.info("combine and build single table")
    combined = ongoing.merge(new, left_on=keys, right_on=keys, how="outer")

    # Missing datapoints are zeroes
    return combined.fillna(0)
def _get_ongoing_metrics(tb: Table) -> Table:
    """Aggregate deaths and count ongoing conflicts by year, region and conflict type.

    Rows for the region 'World' are obtained by aggregating by year and conflict type only.
    """
    # (input column, aggregation, output column). The order defines the output column order.
    aggregations = [
        # Deaths (estimates)
        ("best", "sum", "number_deaths_ongoing_conflicts"),
        ("high", "sum", "number_deaths_ongoing_conflicts_high"),
        ("low", "sum", "number_deaths_ongoing_conflicts_low"),
        # Deaths by type
        ("deaths_civilians", "sum", "number_deaths_ongoing_conflicts_civilians"),
        ("deaths_unknown", "sum", "number_deaths_ongoing_conflicts_unknown"),
        ("deaths_combatants", "sum", "number_deaths_ongoing_conflicts_combatants"),
        # Number of conflicts
        ("conflict_new_id", "nunique", "number_ongoing_conflicts"),
    ]
    agg_funcs = {column: func for column, func, _ in aggregations}
    new_names = {column: name for column, _, name in aggregations}

    # Combatant deaths: deaths on both sides of the conflict
    events = tb.copy()
    events["deaths_combatants"] = events["deaths_a"] + events["deaths_b"]

    def _aggregate(keys):
        return events.groupby(keys, as_index=False).agg(agg_funcs).rename(columns=new_names)

    # Regions and World
    by_region = _aggregate(["year", "region", "conflict_type"])
    world = _aggregate(["year", "conflict_type"])
    world["region"] = "World"

    tb_ongoing = pr.concat([by_region, world], ignore_index=True).sort_values(  # type: ignore
        by=["year", "region", "conflict_type"]
    )

    # The best estimate must equal combatants + civilians + unknown
    deaths_by_type = tb_ongoing[
        [
            "number_deaths_ongoing_conflicts_civilians",
            "number_deaths_ongoing_conflicts_unknown",
            "number_deaths_ongoing_conflicts_combatants",
        ]
    ].sum(axis=1)
    assert (
        tb_ongoing["number_deaths_ongoing_conflicts"] - deaths_by_type == 0
    ).all(), "Sum of deaths from combatants, civilians and unknown should equal best estimate!"
    return tb_ongoing
def _get_new_metrics(tb: Table) -> Table:
    """Count the conflicts that appear for the first time, by year, region and conflict type.

    For a region, a conflict is new in the first year it appears in that region. For 'World',
    it is new in the first year it appears anywhere (a conflict may have started in region A
    in year X, and in region B in year X + 1).
    """

    def _count_new(events, keys):
        counts = events.groupby(keys)[["conflict_new_id"]].nunique().reset_index()
        counts.columns = keys + ["number_new_conflicts"]
        return counts

    # First appearance of each conflict in each region
    columns = ["conflict_new_id", "year", "region", "conflict_type"]
    first_in_region = tb.loc[:, columns].sort_values("year")
    first_in_region = first_in_region.drop_duplicates(subset=["conflict_new_id", "region"], keep="first")
    new_by_region = _count_new(first_in_region, ["year", "region", "conflict_type"])

    # First appearance of each conflict globally
    first_global = first_in_region.sort_values("year").drop_duplicates(subset=["conflict_new_id"], keep="first")
    new_world = _count_new(first_global, ["year", "conflict_type"])
    new_world["region"] = "World"

    # Regions and World
    tb_new = pr.concat([new_by_region, new_world], ignore_index=True).sort_values(  # type: ignore
        by=["year", "region", "conflict_type"]
    )

    return tb_new


def prepare_prio_data(tb_prio: Table) -> Table:
    """Prepare the UCDP/PRIO table and estimate its metrics (ongoing and new conflicts)."""
    return _prio_add_metrics(_prepare_prio_table(tb_prio))
def combine_tables(tb: Table, tb_prio: Table) -> Table:
    """Combine main table with data from UCDP/PRIO.

    The UCDP/PRIO table provides estimates for years before 1989. It only includes
    state-based conflicts!

    For the overlapping year (1989), `number_new_conflicts` is taken from UCDP/PRIO and
    `number_ongoing_conflicts` from the main table.

    Parameters
    ----------
    tb: Table
        Main table, expected to cover 1989 to `LAST_YEAR_CED`.
    tb_prio: Table
        UCDP/PRIO table, expected to cover 1946 to 1989.

    Returns
    -------
    Table
        Outer merge of both tables on year, region and conflict type. The inputs are not modified.
    """
    # Ensure year period for each table is as expected
    assert tb["year"].min() == 1989, "Unexpected start year!"
    assert tb["year"].max() == LAST_YEAR_CED, "Unexpected end year!"
    assert tb_prio["year"].min() == 1946, "Unexpected start year!"
    assert tb_prio["year"].max() == 1989, "Unexpected end year!"

    # Work on copies, so that the tables of the caller are left untouched
    tb = tb.copy()
    tb_prio = tb_prio.copy()

    # Force NaN in 1989 data from Geo-referenced dataset for `number_new_conflicts`
    # We want this data to come from PRIO/UCDP instead!
    tb.loc[tb["year"] == 1989, "number_new_conflicts"] = np.nan
    # Force NaN in 1989 data from PRIO/UCDP dataset for `number_ongoing_conflicts`
    # We want this data to come from GEO instead!
    tb_prio.loc[tb_prio["year"] == 1989, "number_ongoing_conflicts"] = np.nan

    # Merge Geo with UCDP/PRIO
    tb = tb_prio.merge(tb, on=["year", "region", "conflict_type"], suffixes=("_prio", "_main"), how="outer")

    # Sanity checks: first and last year with data, for each indicator and source
    years_expected = {
        ## PRIO/UCDP, ongoing conflicts: 1946 to 1988 (inc)
        "number_ongoing_conflicts_prio": (1946, 1988),
        ## GEO, ongoing conflicts: 1989 to LAST_YEAR_CED (inc)
        "number_ongoing_conflicts_main": (1989, LAST_YEAR_CED),
        ## PRIO/UCDP, new conflicts: 1946 to 1989 (inc)
        "number_new_conflicts_prio": (1946, 1989),
        ## GEO, new conflicts: 1990 to LAST_YEAR_CED (inc)
        "number_new_conflicts_main": (1990, LAST_YEAR_CED),
    }
    for column, (year_min, year_max) in years_expected.items():
        years = tb[tb[column].notna()]["year"]
        assert years.min() == year_min, f"Unexpected start year for `{column}`!"
        assert years.max() == year_max, f"Unexpected end year for `{column}`!"

    # Actually combine timeseries from UCDP/PRIO and GEO.
    # Values from PRIO have priority wherever both are available, therefore the order `PRIO.fillna(MAIN)`
    tb["number_ongoing_conflicts"] = tb["number_ongoing_conflicts_prio"].fillna(tb["number_ongoing_conflicts_main"])
    tb["number_new_conflicts"] = tb["number_new_conflicts_prio"].fillna(tb["number_new_conflicts_main"])

    # Remove unnecessary columns
    columns_remove = tb.filter(regex=r"(_prio|_main)").columns
    tb = tb[[col for col in tb.columns if col not in columns_remove]]

    return tb
def fix_extrasystemic_entries(tb: Table) -> Table:
    """Fix entries with conflict_type='extrasystemic'.

    Rows are added so that every (year, region) combination in the table exists for this
    conflict type. Missing values are then set to zero: in all years for the number of
    conflicts, and only from 1989 onwards for the number of deaths.

    Raises
    ------
    AssertionError
        If the latest year with extrasystemic data is not 1989.
    """
    # Sanity check
    assert (
        tb.loc[tb["conflict_type"] == "extrasystemic", "year"].max() == 1989
    ), "There are years beyond 1989 for extrasystemic conflicts by default!"

    # Get only extra-systemic stuff
    mask = tb["conflict_type"] == "extrasystemic"
    tb_extra = tb.loc[mask].copy()

    # Add all combinations of year and region
    years = np.arange(tb["year"].min(), tb["year"].max() + 1)
    # Unique regions in order of first appearance (deterministic, unlike iterating a set)
    regions = list(dict.fromkeys(tb["region"]))
    new_idx = pd.MultiIndex.from_product([years, regions], names=["year", "region"])
    tb_extra = tb_extra.set_index(["year", "region"]).reindex(new_idx).reset_index()
    tb_extra["conflict_type"] = "extrasystemic"

    # Replace nulls with zeroes (all time series)
    columns = [
        "number_ongoing_conflicts",
        "number_new_conflicts",
    ]
    tb_extra[columns] = tb_extra[columns].fillna(0)

    # Replace nulls with zeroes (only post 1989 time series)
    columns = [
        "number_deaths_ongoing_conflicts",
        "number_deaths_ongoing_conflicts_high",
        "number_deaths_ongoing_conflicts_low",
        "number_deaths_ongoing_conflicts_civilians",
        "number_deaths_ongoing_conflicts_unknown",
        "number_deaths_ongoing_conflicts_combatants",
    ]
    mask_1989 = tb_extra["year"] >= 1989
    tb_extra.loc[mask_1989, columns] = tb_extra.loc[mask_1989, columns].fillna(0)

    # Add to main table, replacing the original extrasystemic rows (`~` inverts the boolean mask)
    tb = pr.concat([tb[~mask], tb_extra])
    return tb
_prepare_prio_table(tb: Table) -> Table: + # Select relevant columns + tb = tb[["conflict_id", "year", "region", "type_of_conflict", "start_date"]] + + # Flatten (some entries have multiple regions, e.g. `1, 2`). This should be flattened to multiple rows. + # https://stackoverflow.com/a/42168328/5056599 + tb["region"] = tb["region"].str.split(", ") + cols = tb.columns[tb.columns != "region"].tolist() + tb = tb[cols].join(tb["region"].apply(pd.Series)) + tb = tb.set_index(cols).stack().reset_index() + tb = tb.drop(tb.columns[-2], axis=1).rename(columns={0: "region"}) + tb["region"] = tb["region"].astype(int) + + # Obtain start year of the conflict + tb["year_start"] = pd.to_datetime(tb["start_date"]).dt.year + + # Rename regions + tb["region"] = tb["region"].map(REGIONS_MAPPING) + + # Create conflict_type + tb["conflict_type"] = tb["type_of_conflict"].map(TYPE_OF_CONFLICT_MAPPING) + + # Checks + assert tb["conflict_type"].isna().sum() == 0, "Some unknown conflict type ids were found!" + assert tb["region"].isna().sum() == 0, "Some unknown region ids were found!" + + # Filter only data from the first year with ongoing conflicts + tb = tb[tb["year_start"] >= tb["year"].min()] + + return tb + + +def _prio_add_metrics(tb: Table) -> Table: + """Things to consider: + + Values for the `number_new_conflicts` in 1989 for conflict types 'one-sided' and 'non-state' (i.e. other than 'state-based') + are not accurate. + This is because the Geo-referenced dataset starts in 1989, and this leads somehow to an overestimate of the number of conflicts + that started this year. We can solve this for 'state-based' conflicts, for which we can get data earlier than 1989 from + the UCDP/PRIO Armed Conflicts dataset. 
+ """ + # Get number of ongoing conflicts for all regions + cols_idx = ["year", "region", "conflict_type"] + tb_ongoing = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_ongoing.columns = cols_idx + ["number_ongoing_conflicts"] + # Get number of ongoing conflicts for 'World' + cols_idx = ["year", "conflict_type"] + tb_ongoing_world = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_ongoing_world.columns = cols_idx + ["number_ongoing_conflicts"] + tb_ongoing_world["region"] = "World" + # Combine regions & world + tb_ongoing = pr.concat([tb_ongoing, tb_ongoing_world], ignore_index=True) + # Keep only until 1989 + tb_ongoing = tb_ongoing[tb_ongoing["year"] < 1989] + + # Get number of new conflicts for all regions + ## Reduce table to only preserve first appearing event + tb = tb.sort_values("year").drop_duplicates(subset=["conflict_id", "year_start", "region"], keep="first") + # Groupby operation + cols_idx = ["year_start", "region", "conflict_type"] + tb_new = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_new.columns = cols_idx + ["number_new_conflicts"] + # Get number of new conflicts for 'World' + tb = tb.sort_values("year").drop_duplicates(subset=["conflict_id", "year_start"], keep="first") + cols_idx = ["year_start", "conflict_type"] + tb_new_world = tb.groupby(cols_idx, as_index=False)["conflict_id"].nunique() + tb_new_world.columns = cols_idx + ["number_new_conflicts"] + tb_new_world["region"] = "World" + # Combine regions & world + tb_new = pr.concat([tb_new, tb_new_world], ignore_index=True) + # Keep only until 1989 (inc) + tb_new = tb_new[tb_new["year_start"] <= 1989] + # Rename column + tb_new = tb_new.rename(columns={"year_start": "year"}) + + # Combine and build single table + tb = tb_ongoing.merge( + tb_new, left_on=["year", "region", "conflict_type"], right_on=["year", "region", "conflict_type"], how="outer" + ) + + # Dtypes + tb = tb.astype({"year": "uint64", "region": "category"}) + + return tb 
+ + +def add_conflict_all(tb: Table) -> Table: + """Add metrics for conflict_type = 'all'. + + Note that this should only be added for years after 1989, since prior to that year we are missing data on 'one-sided' and 'non-state'. + """ + # Estimate number of all conflicts + tb_all = tb.groupby(["year", "region"], as_index=False)[ + [ + "number_deaths_ongoing_conflicts", + "number_deaths_ongoing_conflicts_high", + "number_deaths_ongoing_conflicts_low", + "number_deaths_ongoing_conflicts_civilians", + "number_deaths_ongoing_conflicts_unknown", + "number_deaths_ongoing_conflicts_combatants", + "number_ongoing_conflicts", + "number_new_conflicts", + ] + ].sum() + tb_all["conflict_type"] = "all" + + # Only append values after 1989 (before that we don't have 'one-sided' or 'non-state' counts) + tb_all = tb_all[tb_all["year"] >= 1989] + tb = pr.concat([tb, tb_all], ignore_index=True) + + # Set `number_new_conflicts` to NaN for 1989 + tb.loc[(tb["year"] == 1989) & (tb["conflict_type"] == "all"), "number_new_conflicts"] = np.nan + + return tb + + +def add_conflict_all_intrastate(tb: Table) -> Table: + """Add metrics for conflict_type = 'intrastate'.""" + tb_intra = tb[ + tb["conflict_type"].isin(["intrastate (non-internationalized)", "intrastate (internationalized)"]) + ].copy() + tb_intra = tb_intra.groupby(["year", "region"], as_index=False).sum(numeric_only=True, min_count=1) + tb_intra["conflict_type"] = "intrastate" + tb = pr.concat([tb, tb_intra], ignore_index=True) + return tb + + +def add_conflict_all_statebased(tb: Table) -> Table: + """Add metrics for conflict_type = 'state-based'.""" + tb_state = tb[tb["conflict_type"].isin(TYPE_OF_CONFLICT_MAPPING.values())].copy() + tb_state = tb_state.groupby(["year", "region"], as_index=False).sum(numeric_only=True, min_count=1) + tb_state["conflict_type"] = "state-based" + tb = pr.concat([tb, tb_state], ignore_index=True) + return tb + + +def adapt_region_names(tb: Table) -> Table: + assert not tb["region"].isna().any(), 
"There were some NaN values found for field `region`. This is not expected!" + # Get regions in table + regions = set(tb["region"]) + # Check they are as expected + regions_unknown = regions - (REGIONS_EXPECTED | {"World"}) + assert not regions_unknown, f"Unexpected regions: {regions_unknown}, please review!" + + # Add suffix with source name + msk = tb["region"] != "World" + tb.loc[msk, "region"] = tb.loc[msk, "region"] + " (UCDP)" + return tb + + +def estimate_metrics_participants(tb: Table, tb_prio: Table, tb_codes: Table) -> Table: + """Add participant information at country-level.""" + ################### + # Participated in # + ################### + # FLAG YES/NO (country-level) + + # Get table with [year, conflict_type, code] + codes = ["gwnoa", "gwnob"] + tb_country = pr.concat( + [tb.loc[:, ["year", "conflict_type", code]].rename(columns={code: "id"}).copy() for code in codes] + ) + + # Drop rows with code = NaN + tb_country = tb_country.dropna(subset=["id"]) + # Drop duplicates + tb_country = tb_country.drop_duplicates() + + # Explode where multiple codes + tb_country["id"] = tb_country["id"].astype(str).str.split(";") + tb_country = tb_country.explode("id") + # Drop duplicates (may appear duplicates after exploding) + tb_country = tb_country.drop_duplicates() + # Ensure numeric type + tb_country["id"] = tb_country["id"].astype(int) + + # Sanity check + assert not tb_country.isna().any(axis=None), "There are some NaNs!" + + # Add country name + tb_country["country"] = tb_country.apply(lambda x: tb_codes.loc[(x["id"], x["year"])], axis=1) + assert tb_country["country"].notna().all(), "Some countries were not found! 
NaN was set" + + # Add flag + tb_country["participated_in_conflict"] = 1 + tb_country["participated_in_conflict"].m.origins = tb["gwnoa"].m.origins + + # Prepare GW table + ctypes_all = list(set(tb_country["conflict_type"])) + tb_alltypes = Table(pd.DataFrame({"conflict_type": ctypes_all})) + tb_codes_ = tb_codes.reset_index().merge(tb_alltypes, how="cross") + tb_codes_["country"] = tb_codes_["country"].astype(str) + + # Combine all GW entries with UCDP + columns_idx = ["year", "country", "id", "conflict_type"] + tb_country = tb_codes_.merge(tb_country, on=columns_idx, how="outer") + tb_country["participated_in_conflict"] = tb_country["participated_in_conflict"].fillna(0) + tb_country = tb_country[columns_idx + ["participated_in_conflict"]] + + # Add intrastate (all) + tb_country = aggregate_conflict_types( + tb_country, "intrastate", ["intrastate (non-internationalized)", "intrastate (internationalized)"] + ) + # Add state-based + tb_country = aggregate_conflict_types(tb_country, "state-based", list(TYPE_OF_CONFLICT_MAPPING.values())) + + # Only preserve years that make sense + tb_country = tb_country[(tb_country["year"] >= tb["year"].min()) & (tb_country["year"] <= tb["year"].max())] + + ################### + # Participated in # + ################### + # NUMBER COUNTRIES + + tb_num_participants = get_number_of_countries_in_conflict_by_region(tb_country, "conflict_type") + + # Combine tables + tb_country = pr.concat([tb_country, tb_num_participants], ignore_index=True) + + # Drop column `id` + tb_country = tb_country.drop(columns=["id"]) + + ############ + # Add PRIO # + ############ + tb_country_prio = estimate_metrics_participants_prio(tb_prio, tb_codes) + + tb_country = pr.concat([tb_country, tb_country_prio], ignore_index=True, short_name=f"{paths.short_name}_country") + + return tb_country + + +def estimate_metrics_participants_prio(tb_prio: Table, tb_codes: Table) -> Table: + """Add participant information at country-level. + + Only works for UCDP/PRIO data. 
+ """ + ################### + # Participated in # + ################### + # FLAG YES/NO (country-level) + + # Get table with [year, conflict_type, code] + codes = ["gwno_a", "gwno_a_2nd", "gwno_b", "gwno_b_2nd"] + tb_country = pr.concat( + [tb_prio[["year", "type_of_conflict", code]].rename(columns={code: "id"}).copy() for code in codes] + ) + + # Drop rows with code = NaN + tb_country = tb_country.dropna(subset=["id"]) + # Drop duplicates + tb_country = tb_country.drop_duplicates() + + # Explode where multiple codes + tb_country["id"] = tb_country["id"].astype(str).str.split(",") + tb_country = tb_country.explode("id") + # Ensure numeric type + tb_country["id"] = tb_country["id"].astype(int) + # Drop duplicates (may appear duplicates after exploding) + tb_country = tb_country.drop_duplicates() + + # Sanity check + assert not tb_country.isna().any(axis=None), "There are some NaNs!" + + # Correct codes + ## 751 'Government of Hyderabad' -> 750 'India' + tb_country.loc[tb_country["id"] == 751, "id"] = 750 + ## 817 'Republic of Vietnam' in 1975 -> 816 'Vietnam' + tb_country.loc[(tb_country["id"] == 817) & (tb_country["year"] == 1975), "id"] = 816 + ## 345 'Yugoslavia' after 2005 -> 340 'Serbia' + tb_country.loc[(tb_country["id"] == 345) & (tb_country["year"] > 2005), "id"] = 340 + # Add country name + tb_country["country"] = tb_country.apply(lambda x: tb_codes.loc[(x["id"], x["year"])], axis=1) + assert tb_country["country"].notna().all(), "Some countries were not found! 
NaN was set" + ## Remove duplicates after correcting codes + tb_country = tb_country.drop_duplicates() + + # Add flag + tb_country["participated_in_conflict"] = 1 + tb_country["participated_in_conflict"].m.origins = tb_prio["gwno_a"].m.origins + + # Format conflict tyep + tb_country["conflict_type"] = tb_country["type_of_conflict"].astype(object).replace(TYPE_OF_CONFLICT_MAPPING) + tb_country = tb_country.drop(columns=["type_of_conflict"]) + + # Prepare GW table + tb_alltypes = Table(pd.DataFrame({"conflict_type": tb_country["conflict_type"].unique()})) + tb_codes = tb_codes.reset_index().merge(tb_alltypes, how="cross") + tb_codes["country"] = tb_codes["country"].astype(str) + + # Combine all GW entries with UCDP/PRIO + columns_idx = ["year", "country", "id", "conflict_type"] + tb_country = tb_codes.merge(tb_country, on=columns_idx, how="outer") + tb_country["participated_in_conflict"] = tb_country["participated_in_conflict"].fillna(0) + tb_country = tb_country[columns_idx + ["participated_in_conflict"]] + + # Add intrastate (all) + tb_country = aggregate_conflict_types( + tb_country, "intrastate", ["intrastate (non-internationalized)", "intrastate (internationalized)"] + ) + # Add state-based + tb_country = aggregate_conflict_types(tb_country, "state-based", list(TYPE_OF_CONFLICT_MAPPING.values())) + + # Only preserve years that make sense + tb_country = tb_country[ + (tb_country["year"] >= tb_prio["year"].min()) & (tb_country["year"] <= tb_prio["year"].max()) + ] + + ################### + # Participated in # + ################### + # NUMBER COUNTRIES + + tb_num_participants = get_number_of_countries_in_conflict_by_region(tb_country, "conflict_type") + + # Combine tables + tb_country = pr.concat([tb_country, tb_num_participants], ignore_index=True) + + # Drop column `id` + tb_country = tb_country.drop(columns=["id"]) + + ############### + # Final steps # + ############### + + # Keep only years not covered by UCDP (except for 'extrasystemic') + tb_country = 
tb_country[(tb_country["year"] < 1989) | (tb_country["conflict_type"] == "extrasystemic")] + return tb_country + + +def estimate_metrics_locations(tb: Table, tb_maps: Table, tb_codes: Table, ds_population: Dataset) -> Table: + """Add location-of-conflict information at country-level. + + reference: https://github.com/owid/notebooks/blob/main/JoeHasell/UCDP%20and%20PRIO/UCDP_georeferenced/ucdp_country_extract.ipynb + + tb: actual data + tb_maps: map data (borders and stuff) + tb_codes: from gw codes. so that all countries have either a 1 or 0 (instead of missing data). + ds_population: population data (for rates) + """ + tb_codes_ = tb_codes.reset_index().drop(columns=["id"]).copy() + tb_codes_ = tb_codes_[tb_codes_["year"] >= 1989] + + # Add country name using geometry + paths.log.info("adding location name of conflict event...") + tb_locations = _get_location_of_conflict_in_ucdp_ged(tb, tb_maps).copy() + + # There are some countries not in GW (remove, replace?). We keep Palestine and Western Sahara since + # these are mappable in OWID maps. + # We map entry with id "53238" and relid "PAK-2003-1-345-88" from "Siachen Glacier" to "Pakistan" based on + # the text in `where_description` field, which says: "Giang sector in Siachen, Pakistani Kashmir" + tb_locations.loc[tb_locations["country_name_location"] == "Siachen Glacier", "country_name_location"] = "Pakistan" + + ################### + # COUNTRY-LEVEL: Country in conflict or not (1 or 0) + ################### + paths.log.info("estimating country flag 'is_location_of_conflict'...") + + # Check that the best estimate equals the sum of deaths from combatants, civilians and unknown + assert ( + tb_locations["best"] - tb_locations[["deaths_a", "deaths_b", "deaths_civilians", "deaths_unknown"]].sum(axis=1) + == 0 + ).all(), "Sum of deaths from combatants, civilians and unknown should equal best estimate!"
+ tb_locations["deaths_combatants"] = tb_locations["deaths_a"] + tb_locations["deaths_b"] + + # Estimate if a conflict occured in a country, and the number of deaths in it + # Define aggregations + INDICATOR_BASE_NAME = "number_deaths" + column_props = { + # Deaths (estimates) + "best": { + "f": "sum", + "rename": f"{INDICATOR_BASE_NAME}", + }, + "high": { + "f": "sum", + "rename": f"{INDICATOR_BASE_NAME}_high", + }, + "low": { + "f": "sum", + "rename": f"{INDICATOR_BASE_NAME}_low", + }, + # Deaths by type + "deaths_civilians": { + "f": "sum", + "rename": f"{INDICATOR_BASE_NAME}_civilians", + }, + "deaths_unknown": { + "f": "sum", + "rename": f"{INDICATOR_BASE_NAME}_unknown", + }, + "deaths_combatants": { + "f": "sum", + "rename": f"{INDICATOR_BASE_NAME}_combatants", + }, + # Number of conflicts + "conflict_new_id": { + "f": "nunique", + "rename": "is_location_of_conflict", + }, + } + # TODO: continue here + col_funcs = {k: v["f"] for k, v in column_props.items()} + col_renames = {k: v["rename"] for k, v in column_props.items()} + tb_locations_country = ( + tb_locations.groupby(["country_name_location", "year", "conflict_type"], as_index=False) + .agg(col_funcs) + .rename( + columns={ + "country_name_location": "country", + } + | col_renames + ) + ) + assert tb_locations_country["is_location_of_conflict"].notna().all(), "Missing values in `is_location_of_conflict`!" + cols_num_deaths = [v for v in col_renames.values() if v != "is_location_of_conflict"] + for col in cols_num_deaths: + assert tb_locations_country[col].notna().all(), f"Missing values in `{col}`!" 
+ # Convert into a binary indicator: 1 (if at least one conflict), 0 (otherwise) + tb_locations_country["is_location_of_conflict"] = tb_locations_country["is_location_of_conflict"].apply( + lambda x: 1 if x > 0 else 0 + ) + + # Add missing countries using tb_codes as reference + tb_locations_country = tb_codes_.merge( + tb_locations_country, + on=["country", "year"], + how="outer", + ) + # Add Greenland + assert ( + "Greenland" not in set(tb_locations_country.country) + ), "Greenland is not expected to be there! That's why we force it to zero. If it appears, just remove the following code line" + tb_green = Table(pd.DataFrame({"country": ["Greenland"], "year": [LAST_YEAR]})) + tb_locations_country = pr.concat([tb_locations_country, tb_green], ignore_index=True) + + # NaNs of numeric indicators to zero + cols_indicators = ["is_location_of_conflict"] + cols_num_deaths + tb_locations_country[cols_indicators] = tb_locations_country[cols_indicators].fillna(0) + # NaN in conflict_type to an arbitrary value (since missing ones are filled by the next operation, `expand_time_column`) + mask = tb_locations_country["conflict_type"].isna() + assert ( + tb_locations_country.loc[mask, cols_indicators].sum().sum() == 0 + ), "There are some non-NaNs for NaN-valued conflict types!"
+ tb_locations_country["conflict_type"] = tb_locations_country["conflict_type"].fillna("one-sided violence") + + # Fill with zeroes + tb_locations_country = expand_time_column( + tb_locations_country, + dimension_col=["country", "conflict_type"], + time_col="year", + method="full_range", + fillna_method="zero", + ) + + # Add origins from Natural Earth + cols = ["is_location_of_conflict"] + cols_num_deaths + for col in cols: + tb_locations_country[col].origins += tb_maps["name"].m.origins + + ################### + # Add conflict type aggregates + ################### + paths.log.info("adding conflict type aggregates...") + + # Add missing conflict types + CTYPES_AGGREGATES = { + "intrastate": ["intrastate (non-internationalized)", "intrastate (internationalized)"], + "state-based": list(TYPE_OF_CONFLICT_MAPPING.values()), + "all": list(TYPE_OF_VIOLENCE_MAPPING.values()) + list(TYPE_OF_CONFLICT_MAPPING.values()), + } + for ctype_agg, ctypes in CTYPES_AGGREGATES.items(): + tb_locations_country = aggregate_conflict_types( + tb=tb_locations_country, + parent_name=ctype_agg, + children_names=ctypes, + columns_to_aggregate=["is_location_of_conflict"] + cols_num_deaths, + columns_to_aggregate_absolute=cols_num_deaths, + columns_to_groupby=["country", "year"], + ) + + ################### + # Add rates + ################### + # Add population column + tb_locations_country = geo.add_population_to_table( + tb=tb_locations_country, + ds_population=ds_population, + ) + # Divide and obtain rates + factor = 100_000 + suffix = [c.replace(INDICATOR_BASE_NAME, "") for c in cols_num_deaths] + suffix = [suf for suf in suffix if suf not in {"_combatants", "_unknown", "_civilians"}] + for suf in suffix: + tb_locations_country[f"death_rate{suf}"] = ( + factor * tb_locations_country[f"{INDICATOR_BASE_NAME}{suf}"] / tb_locations_country["population"] + ) + + # Drop population column + tb_locations_country = tb_locations_country.drop(columns=["population"]) + + ################### + # 
REGION-LEVEL: Number of locations with conflict + ################### + paths.log.info("estimating number of locations with conflict...") + + def _get_number_of_locations_with_conflict_regions(tb: Table, cols: List[str]) -> Table: + """Get number of locations with conflict.""" + # For each group, get the number of unique locations + tb = ( + tb.groupby(cols) + .agg( + { + "country_name_location": "nunique", + } + ) + .reset_index() + ) + # Rename columns + if "region" in cols: + column_rename = { + "country_name_location": "number_locations", + "region": "country", + } + else: + column_rename = { + "country_name_location": "number_locations", + } + + tb = tb.rename(columns=column_rename) + return tb + + # Regions + ## Number of countries (given ctypes) + tb_locations_regions = _get_number_of_locations_with_conflict_regions( + tb_locations, ["region", "year", "conflict_type"] + ) + tb_locations_regions_world = _get_number_of_locations_with_conflict_regions(tb_locations, ["year", "conflict_type"]) + tb_locations_regions_world["country"] = "World" + + tbs_locations_regions = [ + tb_locations_regions, + tb_locations_regions_world, + ] + + ## Extra conflict types (aggregates) + cols = ["region", "year"] + for ctype_agg, ctypes in CTYPES_AGGREGATES.items(): + # Keep only children for this ctype aggregate + tb_locations_ = tb_locations[tb_locations["conflict_type"].isin(ctypes)] + # Get actual table, add ctype. 
(also for region 'World') + tb_locations_regions_agg = _get_number_of_locations_with_conflict_regions(tb_locations_, ["region", "year"]) + tb_locations_regions_agg["conflict_type"] = ctype_agg + tb_locations_regions_agg_world = _get_number_of_locations_with_conflict_regions(tb_locations_, ["year"]) + tb_locations_regions_agg_world["conflict_type"] = ctype_agg + tb_locations_regions_agg_world["country"] = "World" + tbs_locations_regions.extend([tb_locations_regions_agg, tb_locations_regions_agg_world]) + + # Combine + tb_locations_regions = pr.concat( + tbs_locations_regions, + ignore_index=True, + ) + + # Add origins + tb_locations_regions["number_locations"].m.origins = tb_locations_country["is_location_of_conflict"].origins + + # Extend to full time-series + fill NaNs with zeros. + tb_locations_regions = expand_time_column( + df=tb_locations_regions, + dimension_col=["country", "conflict_type"], + time_col="year", + method="full_range", + fillna_method="zero", + ) + + ################### + # COMBINE: Country flag + Regional counts + ################### + paths.log.info("combining country flag and regional counts...") + tb_locations = pr.concat( + [tb_locations_country, tb_locations_regions], short_name=f"{paths.short_name}_locations", ignore_index=True + ) + return tb_locations + + +def _get_location_of_conflict_in_ucdp_ged(tb: Table, tb_maps: Table) -> Table: + """Add column with country name of the conflict.""" + # Convert the UCDP data to a GeoDataFrame (so it can be mapped and used in spatial analysis). + # The 'wkt.loads' function takes the coordinates in the 'geometry' column and ensures geopandas will use it to map the data. 
+ gdf = tb[["relid", "geom_wkt"]] + gdf.rename(columns={"geom_wkt": "geometry"}, inplace=True) + gdf["geometry"] = gdf["geometry"].apply(wkt.loads) + gdf = gpd.GeoDataFrame(gdf, crs="epsg:4326") + + # Format the map to be a GeoDataFrame with a gemoetry column + gdf_maps = gpd.GeoDataFrame(tb_maps) + gdf_maps["geometry"] = gdf_maps["geometry"].apply(wkt.loads) + gdf_maps = gdf_maps.set_geometry("geometry") + gdf_maps.crs = "epsg:4326" + + # Use the overlay function to extract data from the world map that each point sits on top of. + gdf_match = gpd.overlay(gdf, gdf_maps, how="intersection") + # Events not assigned to any country + # There are 2100 points that are missed - likely because they are in the sea perhaps due to the conflict either happening at sea or at the coast and the coordinates are slightly inaccurate. + # I've soften the assertion, otherwise a bit of a pain! + assert gdf.shape[0] - gdf_match.shape[0] <= 2200, "Unexpected number of events without exact coordinate match!" + # DEBUG: Examine which are these unlabeled conflicts + # mask = ~tb["relid"].isin(gdf_match["relid"]) + # tb.loc[mask, ["relid", "year", "conflict_name", "side_a", "side_b", "best"]] + + # Get missing entries + ids_missing = set(gdf["relid"]) - set(gdf_match["relid"]) + gdf_missing = gdf.loc[gdf["relid"].isin(ids_missing)] + + # Reprojecting the points and the world into the World Equidistant Cylindrical Sphere projection. 
+ wec_crs = "+proj=eqc +lat_ts=0 +lat_0=0 +lon_0=0 +x_0=0 +y_0=0 +a=6371007 +b=6371007 +units=m +no_defs" + gdf_missing_wec = gdf_missing.to_crs(wec_crs) + gdf_maps_wec = gdf_maps.to_crs(wec_crs) + # For these points we can find the nearest country using the distance function + polygon_near = [] + for _, row in gdf_missing_wec.iterrows(): + polygon_index = gdf_maps_wec.distance(row["geometry"]).sort_values().index[0] + ne_country_name = gdf_maps_wec["name"][polygon_index] + polygon_near.append(ne_country_name) + # Assign + gdf_missing["name"] = polygon_near + + # Combining and adding name to original table + COLUMN_COUNTRY_NAME = "country_name_location" + gdf_country_names = pr.concat([Table(gdf_match[["relid", "name"]]), Table(gdf_missing[["relid", "name"]])]) + tb = tb.merge(gdf_country_names, on="relid", how="left", validate="one_to_one").rename( + columns={"name": COLUMN_COUNTRY_NAME} + ) + assert tb[COLUMN_COUNTRY_NAME].notna().all(), "Some missing values found in `COLUMN_COUNTRY_NAME`" + + # SOME CORRECTIONS # + # To align with OWID borders we will rename the conflicts in Somaliland to Somalia and the conflicts in Morocco that were below 27.66727 latitude to Western Sahara. 
+ ## Somaliland -> Somalia + mask = tb[COLUMN_COUNTRY_NAME] == "Somaliland" + paths.log.info(f"{len(tb.loc[mask, COLUMN_COUNTRY_NAME])} datapoints in Somaliland") + tb.loc[mask, COLUMN_COUNTRY_NAME] = "Somalia" + ## Morocco -> Western Sahara + mask = (tb[COLUMN_COUNTRY_NAME] == "Morocco") & (tb["latitude"] < 27.66727) + paths.log.info(f"{len(tb.loc[mask, COLUMN_COUNTRY_NAME])} datapoints in land contested by Morocco/W.Sahara") + tb.loc[mask, COLUMN_COUNTRY_NAME] = "Western Sahara" + + # Add a flag column for points likely to have incorrect coordinates: + # a) points where coordinates are (0 0), or points where latitude and longitude are exactly the same + tb["flag"] = "" + # Items are (mask, flag_message). NOTE(review): the loop below nulls the country name for rows in `mask` (still the Morocco/W.Sahara mask above), not `error[0]` -- confirm this is intended + errors = [ + ( + tb["geom_wkt"] == "POINT (0 0)", + "coordinates (0 0)", + ), + (tb["latitude"] == tb["longitude"], "latitude = longitude"), + ] + for error in errors: + tb.loc[error[0], "flag"] = error[1] + tb.loc[mask, COLUMN_COUNTRY_NAME] = np.nan + + assert tb[COLUMN_COUNTRY_NAME].isna().sum() == 4, "4 missing values were expected! Found a different amount!"
+ tb = tb.dropna(subset=[COLUMN_COUNTRY_NAME]) + + return tb + + +def extend_latest_years(tb: Table) -> Table: + """Create table with each country present in a year.""" + + index = list(tb.index.names) + tb = tb.reset_index() + + # define mask for last year + mask = tb["year"] == LAST_YEAR_STABLE + + # Get year to extend to + current_year = datetime.now().year + + tb_all_years = Table(pd.RangeIndex(LAST_YEAR_STABLE + 1, current_year + 1), columns=["year"]) + tb_last = tb[mask].drop(columns="year").merge(tb_all_years, how="cross") + + tb = pr.concat([tb, tb_last], ignore_index=True, short_name="gleditsch_countries") + + tb = tb.set_index(index) + return tb diff --git a/etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py b/etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py new file mode 100644 index 00000000000..1e9c8aa2847 --- /dev/null +++ b/etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py @@ -0,0 +1,50 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("ucdp_monthly") + + # Read table from garden dataset. + tb = ds_garden["ucdp_monthly"] + + # Process data. + # + # Rename index column `region` to `country`. + tb = tb.reset_index().rename(columns={"region": "country"}) + # Remove suffixes in region names + tb["country"] = tb["country"].str.replace(r" \(.+\)", "", regex=True) + # Set index + tb = tb.set_index(["year", "country", "conflict_type"]) + + # Get country-level data + tb_participants = ds_garden["ucdp_monthly_country"] + tb_locations = ds_garden["ucdp_monthly_locations"] + + tables = [ + tb, + tb_participants, + tb_locations, + ] + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Remove source description so that it doesn't get appended to the dataset description. + # ds_grapher.metadata.sources[0].description = "" + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py new file mode 100644 index 00000000000..ca40b2c4218 --- /dev/null +++ b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py @@ -0,0 +1,36 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + paths.log.info("start") + + # + # Load inputs. + # + # Retrieve snapshot. + snap_10 = paths.load_snapshot(short_name="ucdp_ced_v24_0_10.csv") + snap_q3 = paths.load_snapshot(short_name="ucdp_ced_v24_01_24_09.csv") + + # Read as tables + tb_10 = snap_10.read_csv() + tb_q3 = snap_q3.read_csv() + + # Check shapes + + tb = tb_10.format("id") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata, check_variables_metadata=True) # type: ignore + + # Save changes in the new garden dataset. 
+ ds_meadow.save() + + paths.log.info("end") From fc2ba8e8d402578eea84428fa8919a6b934398a0 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 13:09:14 +0100 Subject: [PATCH 4/8] fix meadow --- .../data/meadow/war/2024-11-22/ucdp_ced.py | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py index ca40b2c4218..99c4813348c 100644 --- a/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py +++ b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py @@ -1,5 +1,7 @@ """Load a snapshot and create a meadow dataset.""" +import owid.catalog.processing as pr + from etl.helpers import PathFinder, create_dataset # Get paths and naming conventions for current step. @@ -20,15 +22,38 @@ def run(dest_dir: str) -> None: tb_10 = snap_10.read_csv() tb_q3 = snap_q3.read_csv() - # Check shapes + # Remove spurious columns, sanity checks + if "#" in tb_10.columns: + tb_10 = tb_10.drop(columns=["#"]) + + assert (tb_10.columns == tb_q3.columns).all(), "Columns do not match between monthly and quarterly snapshots!" + + # Combine tables + tb = pr.concat([tb_q3, tb_10], ignore_index=True) + tb = tb.drop_duplicates() + + # Monthly data may have events that were already reported in the quarterly release. + # Idea: Check that this is the case, and safely remove duplicates from the quarterly release, since the monthly release is more up-to-date. + + ## Ensure that all duplicate IDs are indeed because of duplicates between monthly-quarterly + value_counts = tb["id"].value_counts() + assert set(value_counts.unique()) == {1, 2}, "IDs should appear once or twice, not more!" + ids_duplicated = list(value_counts[value_counts > 1].index) + assert len(ids_duplicated) == tb_10[tb_10["id"].isin(ids_duplicated)].shape[0], "All duplicated ID" + tb = tb.drop_duplicates(subset="id", keep="last") - tb = tb_10.format("id") + # Format table + tb = tb.format("id") # # Save outputs. 
# # Create a new meadow dataset with the same metadata as the snapshot. - ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata, check_variables_metadata=True) # type: ignore + ds_meadow = create_dataset( + dest_dir, + tables=[tb], + check_variables_metadata=True, + ) # type: ignore # Save changes in the new garden dataset. ds_meadow.save() From c0439417d6451d04f8952f2cb2df20cc38879896 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 13:29:33 +0100 Subject: [PATCH 5/8] rename short_name --- dag/war.yml | 6 +++--- .../{ucdp_monthly.meta.yml => ucdp_preview.meta.yml} | 8 +++----- .../war/2024-11-22/{ucdp_monthly.py => ucdp_preview.py} | 6 ++++-- .../war/2024-11-22/{ucdp_monthly.py => ucdp_preview.py} | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) rename etl/steps/data/garden/war/2024-11-22/{ucdp_monthly.meta.yml => ucdp_preview.meta.yml} (99%) rename etl/steps/data/garden/war/2024-11-22/{ucdp_monthly.py => ucdp_preview.py} (99%) rename etl/steps/data/grapher/war/2024-11-22/{ucdp_monthly.py => ucdp_preview.py} (86%) diff --git a/dag/war.yml b/dag/war.yml index 052b00a47b2..b529bc0b6ce 100644 --- a/dag/war.yml +++ b/dag/war.yml @@ -63,14 +63,14 @@ steps: data://meadow/war/2024-11-22/ucdp_ced: - snapshot://war/2024-11-22/ucdp_ced_v24_0_10.csv - snapshot://war/2024-11-22/ucdp_ced_v24_01_24_09.csv - data://garden/war/2024-11-22/ucdp_monthly: + data://garden/war/2024-11-22/ucdp_preview: - data://meadow/war/2024-11-22/ucdp_ced - data://garden/demography/2024-07-15/population - data://garden/geography/2023-11-28/nat_earth_110 - data://meadow/war/2024-08-26/ucdp - data://garden/countries/2024-08-27/gleditsch - data://grapher/war/2024-11-22/ucdp_monthly: - - data://garden/war/2024-11-22/ucdp_monthly + data://grapher/war/2024-11-22/ucdp_preview: + - data://garden/war/2024-11-22/ucdp_preview # PRIO v3.1 data://meadow/war/2023-09-21/prio_v31: diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml 
b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml similarity index 99% rename from etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml rename to etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml index da1b751d200..73cac11acc5 100644 --- a/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.meta.yml +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml @@ -149,7 +149,7 @@ definitions: tables: # PARTICIPANT INDICATORS - ucdp_monthly_country: + ucdp_preview_country: common: presentation: attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) @@ -204,7 +204,7 @@ tables: <%- endif -%> # LOCATION INDICATORS - ucdp_monthly_locations: + ucdp_preview_locations: common: description_processing: |- {definitions.all.location_conflicts_method} @@ -358,7 +358,7 @@ tables: # {definitions.number_deaths_type.description_short_per_capita} # MAIN INDICATORS - ucdp_monthly: + ucdp_preview: common: presentation: grapher_config: @@ -475,7 +475,6 @@ tables: # display: # numDecimalPlaces: 1 - # number_deaths_ongoing_conflicts_unknown_per_capita: # title: Death rate from unknown type in ongoing conflicts # unit: deaths @@ -486,7 +485,6 @@ tables: # display: # numDecimalPlaces: 1 - ##################### # Ongoing conflicts # ##################### diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py similarity index 99% rename from etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py rename to etl/steps/data/garden/war/2024-11-22/ucdp_preview.py index 2cad268e730..8d3d8362ccd 100644 --- a/etl/steps/data/garden/war/2024-11-22/ucdp_monthly.py +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py @@ -1273,9 +1273,11 @@ def _get_location_of_conflict_in_ucdp_ged(tb: Table, tb_maps: Table) -> Table: # Use the overlay function to extract data from the world map that each point sits on top of. 
gdf_match = gpd.overlay(gdf, gdf_maps, how="intersection") # Events not assigned to any country - # There are 2100 points that are missed - likely because they are in the sea perhaps due to the conflict either happening at sea or at the coast and the coordinates are slightly inaccurate. + # There are 2271 points that are missed - likely because they are in the sea perhaps due to the conflict either happening at sea or at the coast and the coordinates are slightly inaccurate. # I've soften the assertion, otherwise a bit of a pain! - assert gdf.shape[0] - gdf_match.shape[0] <= 2200, "Unexpected number of events without exact coordinate match!" + assert ( + diff := gdf.shape[0] - gdf_match.shape[0] + ) <= 2280, f"Unexpected number of events without exact coordinate match! {diff}" # DEBUG: Examine which are these unlabeled conflicts # mask = ~tb["relid"].isin(gdf_match["relid"]) # tb.loc[mask, ["relid", "year", "conflict_name", "side_a", "side_b", "best"]] diff --git a/etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py b/etl/steps/data/grapher/war/2024-11-22/ucdp_preview.py similarity index 86% rename from etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py rename to etl/steps/data/grapher/war/2024-11-22/ucdp_preview.py index 1e9c8aa2847..7fb6dfc6f26 100644 --- a/etl/steps/data/grapher/war/2024-11-22/ucdp_monthly.py +++ b/etl/steps/data/grapher/war/2024-11-22/ucdp_preview.py @@ -11,10 +11,10 @@ def run(dest_dir: str) -> None: # Load inputs. # # Load garden dataset. - ds_garden = paths.load_dataset("ucdp_monthly") + ds_garden = paths.load_dataset("ucdp_preview") # Read table from garden dataset. - tb = ds_garden["ucdp_monthly"] + tb = ds_garden["ucdp_preview"] # Process data. 
# @@ -26,8 +26,8 @@ def run(dest_dir: str) -> None: tb = tb.set_index(["year", "country", "conflict_type"]) # Get country-level data - tb_participants = ds_garden["ucdp_monthly_country"] - tb_locations = ds_garden["ucdp_monthly_locations"] + tb_participants = ds_garden["ucdp_preview_country"] + tb_locations = ds_garden["ucdp_preview_locations"] tables = [ tb, From 1002472b3aae750b8dc7a4d405d8a35cc623753e Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 13:29:40 +0100 Subject: [PATCH 6/8] meadow --- etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py index 99c4813348c..7d7496f9cd9 100644 --- a/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py +++ b/etl/steps/data/meadow/war/2024-11-22/ucdp_ced.py @@ -43,7 +43,10 @@ def run(dest_dir: str) -> None: tb = tb.drop_duplicates(subset="id", keep="last") # Format table - tb = tb.format("id") + tb = tb.format( + "id", + short_name="ucdp_ced", + ) # # Save outputs. 
From c1b5769ebdfc55f89567f456d10454c8fb993cf8 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 13:45:13 +0100 Subject: [PATCH 7/8] rename dataset, add clarification --- etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml index 73cac11acc5..024f22d4d27 100644 --- a/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.meta.yml @@ -557,9 +557,11 @@ tables: attribution: Uppsala Conflict Data Program and Peace Research Institute Oslo (2024) dataset: - title: UCDP, History of war (monthly) + title: UCDP, History of war (preview) description: |- - This dataset provides information on armed conflicts, using data from the UCDP Georeferenced Event Dataset (version 23.1), the UCDP/PRIO Armed Conflict Dataset (version 23.1), and the UCDP Battle-Related Deaths Dataset (version 23.1). + This dataset provides information on armed conflicts, using data from the UCDP Georeferenced Event Dataset (version 24.1), the UCDP/PRIO Armed Conflict Dataset (version 24.1), the UCDP Battle-Related Deaths Dataset (version 24.1), and the UCDP Candidate Events Dataset (version 24.X). + + It differs from the main "UCDP, History of war" dataset in that it includes data up to the latest available release, using preliminary data from the UCDP Candidate Events Dataset. We aggregate the UCDP Georeferenced Event Dataset up to the year and world (region) to identify all conflict deaths, non-state conflicts, and one-sided violence.
From b5168d4ddfeb3d5201eafc71cc02087620a695e7 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Fri, 22 Nov 2024 15:01:46 +0100 Subject: [PATCH 8/8] possible fix to 'unknown state based' being ignored --- etl/steps/data/garden/war/2024-11-22/ucdp_preview.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py index 8d3d8362ccd..f57ab1c8ec8 100644 --- a/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py +++ b/etl/steps/data/garden/war/2024-11-22/ucdp_preview.py @@ -55,6 +55,7 @@ 2: "interstate", 3: "intrastate (non-internationalized)", 4: "intrastate (internationalized)", + 99: "state-based (unknown)", } # Regions mapping (for PRIO/UCDP dataset) REGIONS_MAPPING = { @@ -355,7 +356,7 @@ def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table: tb_conflict: Table This is a secondary table, that we use to obtain the conflict types of the conflicts. """ - tb_conflict = tb_conflict[["conflict_id", "year", "type_of_conflict"]].drop_duplicates() + tb_conflict = tb_conflict.loc[:, ["conflict_id", "year", "type_of_conflict"]].drop_duplicates() assert tb_conflict.groupby(["conflict_id", "year"]).size().max() == 1, "Some conflict_id-year pairs are duplicated!" # Add `type_of_conflict` to `tb_ged`. 
@@ -363,10 +364,10 @@ def add_conflict_type(tb_ged: Table, tb_conflict: Table) -> Table: tb_ged = tb_ged.merge( tb_conflict, left_on=["conflict_new_id", "year"], right_on=["conflict_id", "year"], how="outer" ) - # Fill unknown types of violence + # Fill unknown types of violence (99: 'state-based (unknown)') mask = tb_ged["type_of_violence"] == 1 # these are state-based conflicts tb_ged["type_of_conflict"] = tb_ged["type_of_conflict"].astype(object) - tb_ged.loc[mask, "type_of_conflict"] = tb_ged.loc[mask, "type_of_conflict"].fillna("state-based (unknown)") + tb_ged.loc[mask, "type_of_conflict"] = tb_ged.loc[mask, "type_of_conflict"].fillna(99) # Assert that `type_of_conflict` was only added for state-based events assert ( @@ -671,7 +672,7 @@ def fix_extrasystemic_entries(tb: Table) -> Table: def _prepare_prio_table(tb: Table) -> Table: # Select relevant columns - tb = tb[["conflict_id", "year", "region", "type_of_conflict", "start_date"]] + tb = tb.loc[:, ["conflict_id", "year", "region", "type_of_conflict", "start_date"]] # Flatten (some entries have multiple regions, e.g. `1, 2`). This should be flattened to multiple rows. # https://stackoverflow.com/a/42168328/5056599