From 5547fda07a6c8a38349d99c5d9f5c2a6282a78d1 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Thu, 9 May 2024 19:38:34 +0200
Subject: [PATCH 01/23] snapshot

---
 .../democracy/2024-05-09/lexical_index.py     | 24 +++++++++++++
 .../2024-05-09/lexical_index.xlsx.dvc         | 35 +++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100644 snapshots/democracy/2024-05-09/lexical_index.py
 create mode 100644 snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc

diff --git a/snapshots/democracy/2024-05-09/lexical_index.py b/snapshots/democracy/2024-05-09/lexical_index.py
new file mode 100644
index 00000000000..1b653a4a598
--- /dev/null
+++ b/snapshots/democracy/2024-05-09/lexical_index.py
@@ -0,0 +1,24 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"democracy/{SNAPSHOT_VERSION}/lexical_index.xlsx")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc b/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
new file mode 100644
index 00000000000..cd671301283
--- /dev/null
+++ b/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
@@ -0,0 +1,35 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Lexical Index of Electoral Democracy (LIED)
+    description: |-
+      LIED is the most comprehensive dataset on democracy in terms of country-years. It covers all
+      independent countries and most semi-sovereign polities and overseas colonies, protectorates, etc. within the 1789 to 2020 timespan. Scores have also been assigned to the units in the case of short-term foreign occupation. Scores for each indicator reflect the status of a country on the last day of the calendar year (31 December) and are not intended to reflect the mean value of an indicator across the previous 364 days. Coding decisions are based on country-specific sources. 
+
+      All original coding has been done by Svend-Erik Skaaning. Svend-Erik Skaaning has developed the conceptual distinctions and cumulative logic associated with the lexical index in collaboration with John Gerring. The distinctions regarding modes of democratic transition and breakdown have been developed by Svend-Erik Skaaning, 1 who has also developed the turnover variables. Henrikas Bartusevicius was in charge of empirical analyses and the coding linked to the inter-coder reliability test presented in the dataset paper (see below).
+
+      The dataset consists of 14 original indicators and two original indices. The LIED dataset offers indicators on whether legislative elections are on track (legislative_elections), whether (direct or indirect) executive elections are on track (executive_elections), whether multiple parties are able to run for legislative elections (multi-party_legislative_elections), whether there is universal male suffrage (male_suffrage), and whether there is universal female suffrage (female_suffrage),2 whether elections are genuinely contested (competitive_elections), whether political liberties in the form of freedom of expression, assembly, and association, are respected (political_liberties), whether countries experienced democratic transition in a given year (democratic_transition), the mode of democratic transition (transition_type), whether countries experienced democratic breakdown in a given year (democratic_breakdown), the mode of democratic breakdown (breakdown_type),whether elections led to a government turnover (turnover_event), and whether a period of competitive elections has been characterized by at least one government turnover (turnover_period). Finally, the data are used to construct two indices, i.e., the Lexical Index of Electoral Democracy (lexical_index) and an extended version called Lexical Index of Electoral Democracy+ (lexical_index_plus).
+    date_published: "2023-07-31"
+    version_producer: v6.5
+
+    # Citation
+    producer: Skaaning et al.
+    citation_full: |-
+      Skaaning, Svend-Erik, John Gerring and Henrikas Bartusevičius. 2015. A Lexical Index of Electoral Democracy. Comparative Political Studies 48(12):1491-1525.
+
+    # Files
+    url_main: https://ps.au.dk/en/research/research-projects/dedere/datasets
+    url_download: https://dataverse.harvard.edu/file.xhtml?fileId=7266277&version=3.0#
+    date_accessed: 2024-05-09
+
+    # License
+    license:
+      name: CC0
+      url: https://creativecommons.org/publicdomain/zero/1.0/
+
+outs:
+  - md5: 6f18955d84bb3a64316cce58c8bfc100
+    size: 96351
+    path: lexical_index.xlsx

From 404bb11387b875fa18029d28530136b12ebebb08 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Thu, 9 May 2024 19:58:15 +0200
Subject: [PATCH 02/23] wip

---
 dag/democracy.yml                             |  8 +++
 .../2024-05-09/lexical_index.countries.json   |  2 +
 .../2024-05-09/lexical_index.meta.yml         | 53 +++++++++++++++++++
 .../democracy/2024-05-09/lexical_index.py     | 37 +++++++++++++
 .../democracy/2024-05-09/lexical_index.py     | 32 +++++++++++
 .../democracy/2024-05-09/lexical_index.py     | 37 +++++++++++++
 .../democracy/2024-05-09/lexical_index.py     | 11 ++--
 .../2024-05-09/lexical_index.xlsx.dvc         |  7 ++-
 8 files changed, 180 insertions(+), 7 deletions(-)
 create mode 100644 etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
 create mode 100644 etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
 create mode 100644 etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
 create mode 100644 etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py
 create mode 100644 etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py

diff --git a/dag/democracy.yml b/dag/democracy.yml
index 45380d019fd..5d535bf1196 100644
--- a/dag/democracy.yml
+++ b/dag/democracy.yml
@@ -28,3 +28,11 @@ steps:
     - data://garden/demography/2023-03-31/population
   data://grapher/democracy/2024-05-01/ert:
     - data://garden/democracy/2024-03-07/ert
+
+  # Lexical Index (2023)
+  data://meadow/democracy/2024-05-09/lexical_index:
+    - snapshot://democracy/2024-05-09/lexical_index.xlsx
+  data://garden/democracy/2024-05-09/lexical_index:
+    - data://meadow/democracy/2024-05-09/lexical_index
+  data://grapher/democracy/2024-05-09/lexical_index:
+    - data://garden/democracy/2024-05-09/lexical_index
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
new file mode 100644
index 00000000000..2c63c085104
--- /dev/null
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
@@ -0,0 +1,2 @@
+{
+}
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
new file mode 100644
index 00000000000..e197b0df667
--- /dev/null
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
@@ -0,0 +1,53 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Democracy
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+
+tables:
+  lexical_index:
+    variables:
+      # testing_variable:
+      #   title: Testing variable title
+      #   unit: arbitrary units
+      #   short_unit: au
+      #   description_short: Short description of testing variable.
+      #   description_processing: Description of processing of testing variable.
+      #   description_key: List of key points about the indicator.
+      #   description_from_producer: Description of testing variable from producer.
+      #   processing_level: minor
+      #   presentation:
+      #     attribution:
+      #     attribution_short:
+      #     faqs:
+      #     grapher_config:
+      #     title_public:
+      #     title_variant:
+      #     topic_tags:
+      #   display:
+      #     color:
+      #     conversionFactor: 1
+      #     description:
+      #     entityAnnotationsMap: Test annotation
+      #     includeInTable:
+      #     isProjection: false
+      #     name: Testing variable
+      #     numDecimalPlaces: 0
+      #     shortUnit: au
+      #     tableDisplay:
+      #       hideAbsoluteChange:
+      #       hideRelativeChange:
+      #     tolerance: 0
+      #     unit: arbitrary units
+      #     yearIsDay: false
+      #     zeroDay:
+      {}
+
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
new file mode 100644
index 00000000000..cb00bd48f40
--- /dev/null
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
@@ -0,0 +1,37 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("lexical_index")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["lexical_index"].reset_index()
+
+    #
+    # Process data.
+    #
+    tb = geo.harmonize_countries(
+        df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
+    )
+    tb = tb.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py b/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py
new file mode 100644
index 00000000000..d4eba37e868
--- /dev/null
+++ b/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py
@@ -0,0 +1,32 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("lexical_index")
+
+    # Read table from garden dataset.
+    tb = ds_garden["lexical_index"]
+
+    #
+    # Process data.
+    #
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py b/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py
new file mode 100644
index 00000000000..cb047dee3e2
--- /dev/null
+++ b/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py
@@ -0,0 +1,37 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("lexical_index.xlsx")
+
+    # Load data from snapshot.
+    tb = snap.read()
+
+    #
+    # Process data.
+    #
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    tb = tb.rename(columns={"countryn": "country"})
+    tb = tb.format(["country", "year"])
+
+    # Dtype
+    tb.loc[tb["lexical_index"] == ",", "lexical_index"] = float("nan")
+    tb["lexical_index"] = tb["lexical_index"].astype("Int64")
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/democracy/2024-05-09/lexical_index.py b/snapshots/democracy/2024-05-09/lexical_index.py
index 1b653a4a598..cf335448a64 100644
--- a/snapshots/democracy/2024-05-09/lexical_index.py
+++ b/snapshots/democracy/2024-05-09/lexical_index.py
@@ -1,4 +1,8 @@
-"""Script to create a snapshot of dataset."""
+"""Script to create a snapshot of dataset.
+
+To download, visit https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/WPKNIT&version=3.0, and download LIED_6.5.xlsx file.
+
+NOTE: in case this site is unavailable, please look for an alternative from the provider's main site: https://ps.au.dk/en/research/research-projects/dedere/datasets (also listed in the metadata)"""
 
 from pathlib import Path
 
@@ -12,12 +16,13 @@
 
 @click.command()
 @click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
-def main(upload: bool) -> None:
+@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
     # Create a new snapshot.
     snap = Snapshot(f"democracy/{SNAPSHOT_VERSION}/lexical_index.xlsx")
 
     # Download data from source, add file to DVC and upload to S3.
-    snap.create_snapshot(upload=upload)
+    snap.create_snapshot(filename=path_to_file, upload=upload)
 
 
 if __name__ == "__main__":
diff --git a/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc b/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
index cd671301283..171919f4bfc 100644
--- a/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
+++ b/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
@@ -6,7 +6,7 @@ meta:
     title: Lexical Index of Electoral Democracy (LIED)
     description: |-
       LIED is the most comprehensive dataset on democracy in terms of country-years. It covers all
-      independent countries and most semi-sovereign polities and overseas colonies, protectorates, etc. within the 1789 to 2020 timespan. Scores have also been assigned to the units in the case of short-term foreign occupation. Scores for each indicator reflect the status of a country on the last day of the calendar year (31 December) and are not intended to reflect the mean value of an indicator across the previous 364 days. Coding decisions are based on country-specific sources. 
+      independent countries and most semi-sovereign polities and overseas colonies, protectorates, etc. within the 1789 to 2020 timespan. Scores have also been assigned to the units in the case of short-term foreign occupation. Scores for each indicator reflect the status of a country on the last day of the calendar year (31 December) and are not intended to reflect the mean value of an indicator across the previous 364 days. Coding decisions are based on country-specific sources.
 
       All original coding has been done by Svend-Erik Skaaning. Svend-Erik Skaaning has developed the conceptual distinctions and cumulative logic associated with the lexical index in collaboration with John Gerring. The distinctions regarding modes of democratic transition and breakdown have been developed by Svend-Erik Skaaning, 1 who has also developed the turnover variables. Henrikas Bartusevicius was in charge of empirical analyses and the coding linked to the inter-coder reliability test presented in the dataset paper (see below).
 
@@ -21,7 +21,6 @@ meta:
 
     # Files
     url_main: https://ps.au.dk/en/research/research-projects/dedere/datasets
-    url_download: https://dataverse.harvard.edu/file.xhtml?fileId=7266277&version=3.0#
     date_accessed: 2024-05-09
 
     # License
@@ -30,6 +29,6 @@ meta:
       url: https://creativecommons.org/publicdomain/zero/1.0/
 
 outs:
-  - md5: 6f18955d84bb3a64316cce58c8bfc100
-    size: 96351
+  - md5: 3956d28a242f0e9e4f3e789bfdf605fc
+    size: 2293527
     path: lexical_index.xlsx

From 3d061cda0b0c57d188707dc81f3985293570a372 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Thu, 9 May 2024 19:58:43 +0200
Subject: [PATCH 03/23] avoid notebook creation

---
 apps/wizard/etl_steps/express.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/apps/wizard/etl_steps/express.py b/apps/wizard/etl_steps/express.py
index 179cc3f82b5..2b102d3ddab 100644
--- a/apps/wizard/etl_steps/express.py
+++ b/apps/wizard/etl_steps/express.py
@@ -360,6 +360,12 @@ def export_metadata() -> None:
         st.success(f"Metadata exported to `{output_path}`.")
 
 
+def remove_notebook(dataset_dir):
+    notebook_path = dataset_dir / "playground.ipynb"
+    if notebook_path.is_file():
+        os.remove(notebook_path)
+
+
 #########################################################
 # MAIN ##################################################
 #########################################################
@@ -533,6 +539,7 @@ def export_metadata() -> None:
                 "channel": "meadow",
             }
         )
+        remove_notebook(DATASET_DIR)
 
         #######################
         # GARDEN ##############
@@ -567,6 +574,7 @@ def export_metadata() -> None:
                 "channel": "garden",
             }
         )
+        remove_notebook(DATASET_DIR)
 
         #######################
         # GRAPHER #############
@@ -581,8 +589,11 @@ def export_metadata() -> None:
                 "channel": "grapher",
             }
         )
+        remove_notebook(DATASET_DIR)
 
-        # Add to DAG
+        #######################
+        # DAG #################
+        #######################
         dag_path = DAG_DIR / form.dag_file
         if form.add_to_dag:
             dag_content = add_to_dag(

From 84c22b178cfce02e1d33f3b418f1f10a051fa965 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Thu, 9 May 2024 20:46:25 +0200
Subject: [PATCH 04/23] clean step + metadata

---
 .../2024-05-09/lexical_index.countries.json   | 226 +++++++++++++++++-
 .../2024-05-09/lexical_index.meta.yml         | 160 ++++++++++---
 .../democracy/2024-05-09/lexical_index.py     |  54 ++++-
 3 files changed, 402 insertions(+), 38 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
index 2c63c085104..d76660c9b38 100644
--- a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
@@ -1,2 +1,226 @@
 {
-}
+  "Afghanistan": "Afghanistan",
+  "Albania": "Albania",
+  "Algeria": "Algeria",
+  "Andorra": "Andorra",
+  "Angola": "Angola",
+  "Antigua and Barbuda": "Antigua and Barbuda",
+  "Argentina": "Argentina",
+  "Armenia": "Armenia",
+  "Australia": "Australia",
+  "Austria": "Austria",
+  "Austria-Hungary": "Austria-Hungary",
+  "Azerbaijan": "Azerbaijan",
+  "Baden": "Grand Duchy of Baden",
+  "Bahamas": "Bahamas",
+  "Bahrain": "Bahrain",
+  "Bangladesh": "Bangladesh",
+  "Barbados": "Barbados",
+  "Bavaria": "Kingdom of Bavaria",
+  "Belarus": "Belarus",
+  "Belgium": "Belgium",
+  "Belize": "Belize",
+  "Benin": "Benin",
+  "Bhutan": "Bhutan",
+  "Bolivia": "Bolivia",
+  "Bosnia and Herzegovina": "Bosnia and Herzegovina",
+  "Botswana": "Botswana",
+  "Brazil": "Brazil",
+  "Brunei": "Brunei",
+  "Bulgaria": "Bulgaria",
+  "Burkina Faso": "Burkina Faso",
+  "Burundi": "Burundi",
+  "Cambodia": "Cambodia",
+  "Cameroon": "Cameroon",
+  "Canada": "Canada",
+  "Cape Verde": "Cape Verde",
+  "Central African Republic": "Central African Republic",
+  "Chad": "Chad",
+  "Chile": "Chile",
+  "China": "China",
+  "Colombia": "Colombia",
+  "Comoros": "Comoros",
+  "Costa Rica": "Costa Rica",
+  "Cote d'Ivoire": "Cote d'Ivoire",
+  "Croatia": "Croatia",
+  "Cuba": "Cuba",
+  "Cyprus": "Cyprus",
+  "Czech Republic": "Czechia",
+  "Czechoslovakia": "Czechoslovakia",
+  "Denmark": "Denmark",
+  "Djibouti": "Djibouti",
+  "Dominica": "Dominica",
+  "Dominican Republic": "Dominican Republic",
+  "East Timor": "East Timor",
+  "Ecuador": "Ecuador",
+  "Egypt": "Egypt",
+  "El Salvador": "El Salvador",
+  "Equatorial Guinea": "Equatorial Guinea",
+  "Eritrea": "Eritrea",
+  "Estonia": "Estonia",
+  "Ethiopia": "Ethiopia",
+  "Fiji": "Fiji",
+  "Finland": "Finland",
+  "France": "France",
+  "Gabon": "Gabon",
+  "Gambia": "Gambia",
+  "Georgia": "Georgia",
+  "Germany": "Germany",
+  "Germany, East": "East Germany",
+  "Germany, West": "West Germany",
+  "Ghana": "Ghana",
+  "Greece": "Greece",
+  "Grenada": "Grenada",
+  "Guatemala": "Guatemala",
+  "Guinea": "Guinea",
+  "Guinea-Bissau": "Guinea-Bissau",
+  "Guyana": "Guyana",
+  "Haiti": "Haiti",
+  "Honduras": "Honduras",
+  "Hungary": "Hungary",
+  "Iceland": "Iceland",
+  "India": "India",
+  "Indonesia": "Indonesia",
+  "Iran": "Iran",
+  "Iraq": "Iraq",
+  "Ireland": "Ireland",
+  "Israel": "Israel",
+  "Italy": "Italy",
+  "Jamaica": "Jamaica",
+  "Japan": "Japan",
+  "Jordan": "Jordan",
+  "Kazakhstan": "Kazakhstan",
+  "Kenya": "Kenya",
+  "Kiribati": "Kiribati",
+  "Kosovo": "Kosovo",
+  "Kuwait": "Kuwait",
+  "Kyrgyzstan": "Kyrgyzstan",
+  "Laos": "Laos",
+  "Latvia": "Latvia",
+  "Lebanon": "Lebanon",
+  "Lesotho": "Lesotho",
+  "Liberia": "Liberia",
+  "Libya": "Libya",
+  "Liechtenstein": "Liechtenstein",
+  "Lithuania": "Lithuania",
+  "Luxembourg": "Luxembourg",
+  "Macedonia": "North Macedonia",
+  "Madagascar": "Madagascar",
+  "Malawi": "Malawi",
+  "Malaysia": "Malaysia",
+  "Maldives": "Maldives",
+  "Mali": "Mali",
+  "Malta": "Malta",
+  "Marshall Islands": "Marshall Islands",
+  "Mauritania": "Mauritania",
+  "Mauritius": "Mauritius",
+  "Mexico": "Mexico",
+  "Micronesia": "Micronesia (country)",
+  "Modena": "Duchy of Modena and Reggio",
+  "Moldova": "Moldova",
+  "Monaco": "Monaco",
+  "Mongolia": "Mongolia",
+  "Montenegro": "Montenegro",
+  "Morocco": "Morocco",
+  "Mozambique": "Mozambique",
+  "Myanmar": "Myanmar",
+  "Namibia": "Namibia",
+  "Nauru": "Nauru",
+  "Nepal": "Nepal",
+  "Netherlands": "Netherlands",
+  "New Zealand": "New Zealand",
+  "Nicaragua": "Nicaragua",
+  "Niger": "Niger",
+  "Nigeria": "Nigeria",
+  "Norway": "Norway",
+  "Oman": "Oman",
+  "Orange Free State": "Orange Free State",
+  "Pakistan": "Pakistan",
+  "Palau": "Palau",
+  "Panama": "Panama",
+  "Papua New Guinea": "Papua New Guinea",
+  "Paraguay": "Paraguay",
+  "Parma": "Duchy of Parma and Piacenza",
+  "Peru": "Peru",
+  "Philippines": "Philippines",
+  "Poland": "Poland",
+  "Portugal": "Portugal",
+  "Qatar": "Qatar",
+  "Romania": "Romania",
+  "Russia": "Russia",
+  "Rwanda": "Rwanda",
+  "Samoa": "Samoa",
+  "San Marino": "San Marino",
+  "Sao Tome and Principe": "Sao Tome and Principe",
+  "Sardinia": "Kingdom of Sardinia",
+  "Saudi Arabia": "Saudi Arabia",
+  "Saxony": "Kingdom of Saxony",
+  "Senegal": "Senegal",
+  "Serbia": "Serbia",
+  "Seychelles": "Seychelles",
+  "Sicily": "Kingdom of the Two Sicilies",
+  "Sierra Leone": "Sierra Leone",
+  "Singapore": "Singapore",
+  "Slovakia": "Slovakia",
+  "Slovenia": "Slovenia",
+  "Solomon Islands": "Solomon Islands",
+  "Somalia": "Somalia",
+  "Somaliland": "Somaliland",
+  "South Africa": "South Africa",
+  "South Sudan": "South Sudan",
+  "Spain": "Spain",
+  "Sri Lanka": "Sri Lanka",
+  "St. Kitts and Nevis": "Saint Kitts and Nevis",
+  "St. Lucia": "Saint Lucia",
+  "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
+  "Sudan": "Sudan",
+  "Suriname": "Suriname",
+  "Swaziland": "Eswatini",
+  "Sweden": "Sweden",
+  "Switzerland": "Switzerland",
+  "Syria": "Syria",
+  "Taiwan": "Taiwan",
+  "Tajikistan": "Tajikistan",
+  "Tanzania": "Tanzania",
+  "Thailand": "Thailand",
+  "Togo": "Togo",
+  "Tonga": "Tonga",
+  "Trinidad and Tobago": "Trinidad and Tobago",
+  "Tunisia": "Tunisia",
+  "Turkey": "Turkey",
+  "Turkmenistan": "Turkmenistan",
+  "Tuscany": "Grand Duchy of Tuscany",
+  "Tuvalu": "Tuvalu",
+  "USSR": "USSR",
+  "Uganda": "Uganda",
+  "Ukraine": "Ukraine",
+  "United Arab Emirates": "United Arab Emirates",
+  "United Kingdom": "United Kingdom",
+  "United States": "United States",
+  "Uruguay": "Uruguay",
+  "Uzbekistan": "Uzbekistan",
+  "Vanuatu": "Vanuatu",
+  "Venezuela": "Venezuela",
+  "Vietnam": "Vietnam",
+  "Wuerttemberg": "Kingdom of Wurttemberg",
+  "Yemen": "Yemen",
+  "Yugoslavia": "Yugoslavia",
+  "Zambia": "Zambia",
+  "Zanzibar": "Zanzibar",
+  "Zimbabwe": "Zimbabwe",
+  "Congo Brazzaville": "Congo",
+  "Congo, Democratic Republic": "Democratic Republic of Congo",
+  "Gran Colombia": "Great Colombia",
+  "Korea": "Korea (former)",
+  "Korea, North": "North Korea",
+  "Korea, South": "South Korea",
+  "Mecklenburg-Schwerin": "Mecklenburg Schwerin",
+  "Palestine/British Mandate": "Palestine",
+  "Papal states, the": "Vatican",
+  "Sahrawi": "Western Sahara",
+  "Serbia-Montenegro": "Serbia and Montenegro",
+  "Vietnam, North": "Democratic Republic of Vietnam",
+  "Vietnam, South": "Republic of Vietnam",
+  "Yemen, North": "Yemen Arab Republic",
+  "Yemen, South": "Yemen People's Republic"
+}
\ No newline at end of file
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
index e197b0df667..82cac9e820f 100644
--- a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
@@ -11,43 +11,131 @@ definitions:
 dataset:
   update_period_days: 365
 
-
 tables:
   lexical_index:
     variables:
-      # testing_variable:
-      #   title: Testing variable title
-      #   unit: arbitrary units
-      #   short_unit: au
-      #   description_short: Short description of testing variable.
-      #   description_processing: Description of processing of testing variable.
-      #   description_key: List of key points about the indicator.
-      #   description_from_producer: Description of testing variable from producer.
-      #   processing_level: minor
-      #   presentation:
-      #     attribution:
-      #     attribution_short:
-      #     faqs:
-      #     grapher_config:
-      #     title_public:
-      #     title_variant:
-      #     topic_tags:
-      #   display:
-      #     color:
-      #     conversionFactor: 1
-      #     description:
-      #     entityAnnotationsMap: Test annotation
-      #     includeInTable:
-      #     isProjection: false
-      #     name: Testing variable
-      #     numDecimalPlaces: 0
-      #     shortUnit: au
-      #     tableDisplay:
-      #       hideAbsoluteChange:
-      #       hideRelativeChange:
-      #     tolerance: 0
-      #     unit: arbitrary units
-      #     yearIsDay: false
-      #     zeroDay:
-      {}
+      exelec_lied:
+        title: Elections for chief executive
+        unit: ''
+        description_short: |-
+          The variable indicates whether some citizens directly or indirectly elect the chief executive. It considers political systems that do not govern themselves — for example because of external interventions, occupations, or colonization — as not holding executive elections.
+        description_from_producer: |-
+          Indicates whether the chief executive is either directly or indirectly elected (i.e., chosen by people who have been elected). This indicator takes into account whether executive power is responsible to an elected parliament if the executive is not directly elected, a situation generated by a series of historical and contemporary monarchies and principalities. Episodes of international supervision or domination following international interventions, occupation, or colonization, meaning that the polity does practice exercise self-government, are also understood as disqualifying. 1=present, 0=absent.
+
+          Equivalent indicator: `executive_elections`
+
+      legelec_lied:
+        title: Legislative elections
+        unit: ''
+        description_short: |-
+          The variable indicates whether some citizens elect a legislature which does issue some laws, but does not perform executive functions.
+        description_from_producer: |-
+          Indicates whether a legislative body, a parliament, issues at least some laws and does not perform executive functions. The lower house (or unicameral chamber) of the legislature is at least partly elected. The legislature has not been closed. 1=present, 0=absent.
+
+          Equivalent indicator: `legislative_elections`
+
+      opposition_lied:
+        title: Political opposition
+        unit: ''
+        description_short: |-
+          Indicates whether more than one party or non-party candidate are able to compete in elections for the legislature.
+        description_from_producer: |-
+          Indicates whether the lower house (or unicameral chamber) of the legislature is (at least in part) elected by voters facing more than one choice. Specifically, parties are not banned and (a) more than one party, including opposition parties, are allowed to compete or (b) candidates run without party labels but represent distinct political positions. 1=present, 0=absent.
+
+          Equivalent indicator: `multiparty_legislative_election`
+
+      competition_lied:
+        title: Competitive elections
+        unit: ''
+        description_short: |-
+          Whether the outcomes of elections are uncertain because their timing is not violated, voters are not systematically coerced, and election fraud is not consequential.
+        description_key:
+          - It considers the incumbent changing after multi-party elections as a strong indicator, but neither necessary nor sufficient.
+          - It does not consider whether all contestants have access to funding and the media, and media coverage is unbiased.
+        description_from_producer: |-
+          The chief executive offices and seats in the effective legislative body are filled by elections characterized by uncertainty, meaning that the elections are, in principle, sufficiently free to enable the opposition to gain power if they were to attract sufficient support from the electorate. This presumes that control over key executive and legislative offices is determined by elections, the executive and members of the legislature have not been unconstitutionally removed, and the legislature has not been dissolved. With respect to the electoral process, this presumes that the constitutional timing of elections has not been violated (in a more than marginal fashion), non- extremist parties are not banned, opposition candidates are generally free to participate, voters experience little systematic coercion in exercising their electoral choice, and electoral fraud does not determine who wins. With respect to the outcome, this presumes that the declared winner of executive and legislative elections reflects the votes cast by the electorate, as near as can be determined from extant sources. Incumbent turnover (as a result of multi-party elections) is regarded as a strong indicator of competition, but is neither necessary nor sufficient. In addition, we rely on reports from outside observers (as reported in books, articles, and country reports) about whether the foregoing conditions have been met in a given election. Coding for this variable does not take into account whether there is a level playing field, whether all contestants gain access to funding and media, whether media coverage is unbiased, whether civil liberties are respected, or other features associated with fully free and fair elections. 1=present, 0=absent.
+
+          Equivalent indicator: `competitive_elections`
+
+      poliberties_lied:
+        title: Political liberties
+        unit: ''
+        description_short: Indicates whether the freedoms of expression, assembly, and association are respected.
+        description_from_producer: |-
+          Freedom of expression, freedom of assembly, and freedom of association are respected. All groups, which are not openly anti-democratic, are allowed to organize freely and to assemble peacefully, and free speech, including critique of government and state-authorities, is tolerated and practiced freely by individuals and groups, including private as well as public media outlets. 1=present, 0=absent.
+
+          Equivalent indicator: ``
+
+      regime_redux_lied:
+        title: Political regime (reduced)
+        unit: ''
+        description_short: TODO
+        description_from_producer: |-
+          We operationalize electoral democracy as a series of necessary-and-sufficient conditions arrayed in an ordinal scale. The resulting Lexical Index of Electoral Democracy (LIED). In this fashion, we arrive at an index that performs a classificatory function, each level identifies a unique and theoretically meaningful regime type, as well as a discriminating function. To generate the lexical index from the six binary variables described above, a country-year is assigned scores (0 to 6) based on the following criteria:
+
+          0: legislative_election=0 & executive_elections=0 (regime type: non-electoral autocracies)
+
+          1: legislative_elections=1 or executive_elections=1 & multi-party_legislative_elections=0 (regime type: one-party autocracies, few cases where executive elections are on track but there is no functioning elected parliament)
+
+          2: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=0 (regime type: multiparty autocracies without elected executive – generally because a monarch influences government appointment and removal or foreign powers dominate political decision-making or has significant veto powers)
+
+          3: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=0 (regime type: multiparty autocracies)
+
+          4: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=0 (regime type: exclusive democracies)
+
+          5: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=0 (regime type: male democracies)
+
+          6: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=1 (regime type: electoral democracies)
+
+          Equivalent indicator: `lexical_index`
+
+      regime_lied:
+        title: Political regime
+        unit: ''
+        description_short: |-
+          The variable identifies the political regime of a country using the Lexical Index of Electoral Democracy by political scientists Svend-Erik Skaaning, John Gerring, and Henrikas Bartusevičius. It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6), and polyarchies (score 7).
+        description_key:
+          - In non-electoral autocracies, citizens do not have the right to elect the chief executive or the legislature.
+          - In one-party autocracies, some citizens have the right to choose the chief executive or the legislature, but only have one choice.
+          - Multiparty autocracies without an elected executive are otherwise one-party autocracies, but the chief executive of the government is not elected even if citizens have more than one choice in legislative elections.
+          - Multiparty autocracies are one-party autocracies in which citizens have more than one choice, though election outcome is certain.
+          - In exclusive democracies, citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections, but suffrage is restricted.
+          - Male democracies are exclusive democracies that have comprehensive suffrage for men.
+          - Electoral democracies are male democracies that also have comprehensive suffrage for women.
+          - Polyarchies are electoral democracies that also protect the freedoms of expression, assembly, and association.
+        description_from_producer: |-
+          This index, LIED+, add an extra layer to the upper-end of LIED in the form of political liberties. This is done to distinguish between electoral democracies and polyarchies. The meaning of the scores from 0 to 5 are identical to LIED, whereas 6 and 7 refer to the following configurations of indicator values:
+
+          6: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=1 & political_liberties=0 (regime type: electoral democracies)
+
+          7: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=1 & political_liberties=1 (regime type: polyarchies)
+
+          Equivalent indicator: `lexical_index_plus`
+        description_processing: |-
+          0: non-electoral autocracy
+          1: one-party autocracy
+          2: multi-party autocracy without elected executive
+          3: multi-party autocracy
+          4: exclusive democracy
+          5: male democracy
+          6: electoral democracy
+          7: polyarchy
+
+      male_suffrage_lied:
+        title: Universal suffrage for men
+        unit: ''
+        description_short: TODO
+        description_from_producer: |-
+          Indicates whether virtually all male citizens are allowed to vote in national elections. Legal restrictions pertaining to age, criminal conviction, incompetence, and local residency are not considered. Informal restrictions such as those obtaining in the American South prior to 1965 are also not considered. 1=present, 0=absent.
+
+          Equivalent indicator: `male_suffrage`
+
+      female_suffrage_lied:
+        title: Universal suffrage for women
+        unit: ''
+        description_short: TODO
+        description_from_producer: |-
+          Indicates whether virtually all female citizens are allowed to vote in national elections. Similar coding rules apply. 1=present, 0=absent.
+
+          Equivalent indicator: `female_suffrage`
 
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
index cb00bd48f40..35320d8ab02 100644
--- a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
@@ -1,5 +1,7 @@
 """Load a meadow dataset and create a garden dataset."""
 
+from owid.catalog import Table
+
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
 
@@ -21,8 +23,40 @@ def run(dest_dir: str) -> None:
     # Process data.
     #
     tb = geo.harmonize_countries(
-        df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
+        df=tb,
+        countries_file=paths.country_mapping_path,
     )
+    # Rename columns of interest
+    tb = rename_columns(tb)
+
+    # HOTFIX 2 -> 1 encoding
+    countries_miss_encoded = set(tb.loc[(tb["opposition_lied"] == 2) | (tb["legelec_lied"] == 2), "country"])
+    assert countries_miss_encoded == {"Botswana"}
+    tb.loc[tb["opposition_lied"] == 2, "opposition_lied"] = 1
+    tb.loc[tb["legelec_lied"] == 2, "legelec_lied"] = 1
+
+    # HOTFIX: if regime_lied is 7, then regime_redux_lied should be 6
+    # There is an error in Argentina@2022
+    tb.loc[(tb["regime_lied"] == 7), "regime_redux_lied"] = 6
+
+    # Select relevant columns
+    tb = tb[
+        [
+            "country",
+            "year",
+            "regime_lied",
+            "regime_redux_lied",
+            "exelec_lied",
+            "legelec_lied",
+            "opposition_lied",
+            "competition_lied",
+            "male_suffrage_lied",
+            "female_suffrage_lied",
+            "poliberties_liead",
+        ]
+    ]
+
+    # Format
     tb = tb.format(["country", "year"])
 
     #
@@ -35,3 +69,21 @@ def run(dest_dir: str) -> None:
 
     # Save changes in the new garden dataset.
     ds_garden.save()
+
+
+def rename_columns(tb: Table) -> Table:
+    """Rename variables of interest."""
+    tb = tb.rename(
+        columns={
+            "executive_elections": "exelec_lied",
+            "legislative_elections": "legelec_lied",
+            "multi_party_legislative_elections": "opposition_lied",
+            "competitive_elections": "competition_lied",
+            "political_liberties": "poliberties_lied",
+            "lexical_index": "regime_redux_lied",
+            "lexical_index_plus": "regime_lied",
+            "male_suffrage": "male_suffrage_lied",
+            "female_suffrage": "female_suffrage_lied",
+        }
+    )
+    return tb

From 1c56bf8201f310a43b7aad2a9898cc50a4d47ba7 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Thu, 9 May 2024 20:47:04 +0200
Subject: [PATCH 05/23] typo

---
 etl/steps/data/garden/democracy/2024-05-09/lexical_index.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
index 35320d8ab02..905bf6f459f 100644
--- a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
@@ -52,7 +52,7 @@ def run(dest_dir: str) -> None:
             "competition_lied",
             "male_suffrage_lied",
             "female_suffrage_lied",
-            "poliberties_liead",
+            "poliberties_lied",
         ]
     ]
 

From 72502bcf6b841ed6484758900abb6c8b030625d6 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 12:40:46 +0200
Subject: [PATCH 06/23] wip

---
 .../2024-05-09/lexical_index.meta.yml         | 10 +++
 .../democracy/2024-05-09/lexical_index.py     | 73 +++++++++++++++----
 2 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
index 82cac9e820f..7f439b589c0 100644
--- a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
@@ -139,3 +139,13 @@ tables:
 
           Equivalent indicator: `female_suffrage`
 
+      suffrage_lied:
+        title: Universal suffrage
+        unit: ''
+        description_short: |-
+          Indicates if all citizens are allowed to vote in national elections. Score 0: "no universal suffrage", 1: "universal suffrage only for men" and 2: "universal suffrage for men and women".
+
+      democracy_lied:
+        title: "Electoral democracy"
+        unit: ''
+        description_short: TODO
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
index 905bf6f459f..f5517897234 100644
--- a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
@@ -22,6 +22,60 @@ def run(dest_dir: str) -> None:
     #
     # Process data.
     #
+    # Initial cleaning
+    tb = preprocess(tb)
+
+    # Format
+    tb = tb.format(["country", "year"])
+
+    ######################### WIP
+    # Create variable distinguishing between democracies and autocracies:
+    mask = tb["regime_redux_lied"] == 6
+    tb.loc[mask, "democracy_lied"] = 1
+    tb.loc[~mask, "democracy_lied"] = 0
+
+    # Create variable for age of electoral democracies
+
+    # Create variable for age of polyarchies
+
+    # Create variable for experience with electoral democracy
+
+    # Create variable for experience with polyarchy
+
+    # Create variable for age group of electoral demcoracies
+
+    # Create variable for age group of polyarchies
+
+    # Create variable for universal suffrage
+    tb.loc[(tb["male_suffrage_lied"] == 0) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 0
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 1
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 1.5
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 2
+
+    assert (
+        (tb["suffrage_lied"] == 1.5).sum() == 0
+    ), "There are countries with women suffrage but not men suffrage! This is not expected and can lead to confusing visualisations."
+
+    # Add labels for ages of electoral democracies and polyarchies to optimize use in the OWID grapher
+    #############################
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
+
+
+def preprocess(tb: Table) -> Table:
+    """Pre-process data.
+
+    Includes: removing NaNs, fixing bugs, sanity checks, renaming and selecting relevant columns.
+    """
+    ## Harmonize country names
     tb = geo.harmonize_countries(
         df=tb,
         countries_file=paths.country_mapping_path,
@@ -40,7 +94,8 @@ def run(dest_dir: str) -> None:
     tb.loc[(tb["regime_lied"] == 7), "regime_redux_lied"] = 6
 
     # Select relevant columns
-    tb = tb[
+    tb = tb.loc[
+        :,
         [
             "country",
             "year",
@@ -53,22 +108,10 @@ def run(dest_dir: str) -> None:
             "male_suffrage_lied",
             "female_suffrage_lied",
             "poliberties_lied",
-        ]
+        ],
     ]
 
-    # Format
-    tb = tb.format(["country", "year"])
-
-    #
-    # Save outputs.
-    #
-    # Create a new garden dataset with the same metadata as the meadow dataset.
-    ds_garden = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
-    )
-
-    # Save changes in the new garden dataset.
-    ds_garden.save()
+    return tb
 
 
 def rename_columns(tb: Table) -> Table:

From 57681cf7160108e0e00f06d9685b322e27a7d108 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 14:36:45 +0200
Subject: [PATCH 07/23] change version to use shared tools

---
 dag/democracy.yml                                             | 4 +++-
 .../{2024-05-09 => 2024-03-07}/lexical_index.countries.json   | 0
 .../{2024-05-09 => 2024-03-07}/lexical_index.meta.yml         | 0
 .../democracy/{2024-05-09 => 2024-03-07}/lexical_index.py     | 0
 4 files changed, 3 insertions(+), 1 deletion(-)
 rename etl/steps/data/garden/democracy/{2024-05-09 => 2024-03-07}/lexical_index.countries.json (100%)
 rename etl/steps/data/garden/democracy/{2024-05-09 => 2024-03-07}/lexical_index.meta.yml (100%)
 rename etl/steps/data/garden/democracy/{2024-05-09 => 2024-03-07}/lexical_index.py (100%)

diff --git a/dag/democracy.yml b/dag/democracy.yml
index 5d535bf1196..5b757ada1fe 100644
--- a/dag/democracy.yml
+++ b/dag/democracy.yml
@@ -32,7 +32,9 @@ steps:
   # Lexcial Index (2023)
   data://meadow/democracy/2024-05-09/lexical_index:
     - snapshot://democracy/2024-05-09/lexical_index.xlsx
-  data://garden/democracy/2024-05-09/lexical_index:
+  data://garden/democracy/2024-03-07/lexical_index:
     - data://meadow/democracy/2024-05-09/lexical_index
+    - data://garden/regions/2023-01-01/regions
+    - data://garden/demography/2023-03-31/population
   data://grapher/democracy/2024-05-09/lexical_index:
     - data://garden/democracy/2024-05-09/lexical_index
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries.json
similarity index 100%
rename from etl/steps/data/garden/democracy/2024-05-09/lexical_index.countries.json
rename to etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries.json
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
similarity index 100%
rename from etl/steps/data/garden/democracy/2024-05-09/lexical_index.meta.yml
rename to etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
diff --git a/etl/steps/data/garden/democracy/2024-05-09/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
similarity index 100%
rename from etl/steps/data/garden/democracy/2024-05-09/lexical_index.py
rename to etl/steps/data/garden/democracy/2024-03-07/lexical_index.py

From 91eb63343fb0cf74be09095b076c8eefc805ba33 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 18:10:08 +0200
Subject: [PATCH 08/23] refine implemented

---
 dag/democracy.yml                             |   2 +-
 .../2024-03-07/lexical_index.meta.yml         |  70 ++++++++++--
 .../democracy/2024-03-07/lexical_index.py     | 103 +++++++++++++-----
 .../garden/democracy/2024-03-07/shared.py     |  75 ++++++++++++-
 4 files changed, 214 insertions(+), 36 deletions(-)

diff --git a/dag/democracy.yml b/dag/democracy.yml
index 5b757ada1fe..65c48f54500 100644
--- a/dag/democracy.yml
+++ b/dag/democracy.yml
@@ -37,4 +37,4 @@ steps:
     - data://garden/regions/2023-01-01/regions
     - data://garden/demography/2023-03-31/population
   data://grapher/democracy/2024-05-09/lexical_index:
-    - data://garden/democracy/2024-05-09/lexical_index
+    - data://garden/democracy/2024-03-07/lexical_index
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index 7f439b589c0..943133cf743 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -67,9 +67,10 @@ tables:
           Equivalent indicator: ``
 
       regime_redux_lied:
-        title: Political regime (reduced)
+        title: Political regime (reduced)
         unit: ''
-        description_short: TODO
+        description_short: |-
+          Identifies the political regime of a country. It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6).
         description_from_producer: |-
           We operationalize electoral democracy as a series of necessary-and-sufficient conditions arrayed in an ordinal scale. The resulting Lexical Index of Electoral Democracy (LIED). In this fashion, we arrive at an index that performs a classificatory function, each level identifies a unique and theoretically meaningful regime type, as well as a discriminating function. To generate the lexical index from the six binary variables described above, a country-year is assigned scores (0 to 6) based on the following criteria:
 
@@ -93,7 +94,7 @@ tables:
         title: Political regime
         unit: ''
         description_short: |-
-          The variable identifies the political regime of a country using the Lexical Index of Electoral Democracy by political scientists Svend-Erik Skaaning, John Gerring, and Henrikas Bartusevičius. It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6), and polyarchies (score 7).
+          Identifies the political regime of a country. It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6), and polyarchies (score 7).
         description_key:
           - In non-electoral autocracies, citizens do not have the right to elect the chief executive or the legislature.
           - In one-party autocracies, some citizens have the right to choose the chief executive or the legislature, but only have one choice.
@@ -124,7 +125,8 @@ tables:
       male_suffrage_lied:
         title: Universal suffrage for men
         unit: ''
-        description_short: TODO
+        description_short: |-
+          Indicates whether all men are allowed to vote in national elections. It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency.
         description_from_producer: |-
           Indicates whether virtually all male citizens are allowed to vote in national elections. Legal restrictions pertaining to age, criminal conviction, incompetence, and local residency are not considered. Informal restrictions such as those obtaining in the American South prior to 1965 are also not considered. 1=present, 0=absent.
 
@@ -133,7 +135,8 @@ tables:
       female_suffrage_lied:
         title: Universal suffrage for women
         unit: ''
-        description_short: TODO
+        description_short: |-
+          Indicates whether all women are allowed to vote in national elections. It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency.
         description_from_producer: |-
           Indicates whether virtually all female citizens are allowed to vote in national elections. Similar coding rules apply. 1=present, 0=absent.
 
@@ -143,9 +146,62 @@ tables:
         title: Universal suffrage
         unit: ''
         description_short: |-
-          Indicates if all citizens are allowed to vote in national elections. Score 0: "no universal suffrage", 1: "universal suffrage only for men" and 2: "universal suffrage for men and women".
+          Indicates whether virtually all men and women who are citizens are allowed to vote in national elections (score 2), whether only men are (score 1), or whether there is no universal right to vote for either men or women (score 0). It considers neither informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency.
+        description_processing: |-
+          It combines the indicators `male_suffrage` and `female_suffrage` in Skaaning et al. (2015).
 
       democracy_lied:
         title: "Electoral democracy"
         unit: ''
-        description_short: TODO
+        description_short: |-
+          The variable identifies the political regime of a country using the reduced Lexical Index of Electoral Democracy. It distinguishes between non-electoral autocracies, one-party autocracies, multi-party autocracies without elected executive, multi-party autocracies, exclusive democracies, male democracies, and electoral democracies (including polyarchies).
+        description_key:
+          - In non-electoral autocracies, citizens do not have the right to elect the chief executive or the legislature.
+          - In one-party autocracies, some citizens have the right to choose the chief executive or the legislature, but only have one choice.
+          - Multiparty autocracies without an elected executive are otherwise one-party autocracies, but the chief executive of the government is not elected even if citizens have more than one choice in legislative elections.
+          - Multiparty autocracies are one-party autocracies in which citizens have more than one choice, though election outcome is certain.
+          - In exclusive democracies, citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections, but suffrage is restricted.
+          - Male democracies are exclusive democracies that have comprehensive suffrage for men.
+          - Electoral democracies are male democracies that also have comprehensive suffrage for women.
+
+      age_electdem_lied:
+        title: 'Age of electoral democracy'
+        unit: ''
+        description_short: |-
+          Number of consecutive years in electoral democracy.
+        description_key: &key_electdem
+          - Electoral democracies are understood here as political systems in which citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections.
+
+      group_age_electdem_lied:
+        title: 'Age of electoral democracy (category)'
+        unit: ''
+        description_short: |-
+          The variable distinguishes between non-electoral autocracies, one-party autocracies, multi-party autocracies without elected executive, multi-party autocracies, exclusive democracies, male democracies, electoral democracies aged 1-18 years, 19-30 years, 31-60 years, 61-90 years, and 91+ years in electoral democracy.
+        description_key: *key_electdem
+
+      experience_electdem_lied:
+        title: 'Experience with electoral democracy'
+        unit: ''
+        description_short: |-
+          Number of total years in electoral democracy. It sums all periods of electoral democracy.
+        description_key: *key_electdem
+
+      age_polyarchy_lied:
+        title: 'Age of polyarchy'
+        unit: ''
+        description_short: Number of consecutive years in a polyarchy.
+        description_key: &key_poly
+          - In polyarchies, citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections, and enjoy freedoms of expression, assembly, and association.
+
+      group_age_polyarchy_lied:
+        title: 'Age of polyarchy (category)'
+        unit: ''
+        description_short: |-
+          The variable distinguishes between non-electoral autocracies, one-party autocracies, multi-party autocracies without elected executive, multi-party autocracies, exclusive democracies, male democracies, electoral democracies, polyarchies aged 1-18 years, 19-30 years, 31-60 years, 61-90 years, and 91+ years in a polyarchy.
+        description_key: *key_poly
+
+      experience_polyarchy_lied:
+        title: 'Experience with polyarchy'
+        unit: ''
+        description_short: Number of total years in a polyarchy. It sums all periods of polyarchy.
+        description_key: *key_poly
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index f5517897234..a1fdf1af5d2 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -1,6 +1,7 @@
 """Load a meadow dataset and create a garden dataset."""
 
 from owid.catalog import Table
+from shared import add_age_groups, add_count_years_in_regime
 
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
@@ -8,6 +9,16 @@
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
 
+REGIME_LABELS = {
+    0: "non-electoral autocracy",
+    1: "one-party autocracy",
+    2: "multi-party autocracy without elected executive",
+    3: "multi-party autocracy",
+    4: "exclusive democracy",
+    5: "male democracy",
+    6: "electoral democracy",
+}
+
 
 def run(dest_dir: str) -> None:
     #
@@ -25,39 +36,22 @@ def run(dest_dir: str) -> None:
     # Initial cleaning
     tb = preprocess(tb)
 
-    # Format
-    tb = tb.format(["country", "year"])
-
-    ######################### WIP
     # Create variable distinguishing between democracies and autocracies:
-    mask = tb["regime_redux_lied"] == 6
-    tb.loc[mask, "democracy_lied"] = 1
-    tb.loc[~mask, "democracy_lied"] = 0
+    tb = add_is_democracy(tb)
 
-    # Create variable for age of electoral democracies
-
-    # Create variable for age of polyarchies
-
-    # Create variable for experience with electoral democracy
-
-    # Create variable for experience with polyarchy
-
-    # Create variable for age group of electoral demcoracies
-
-    # Create variable for age group of polyarchies
+    # Create indicators with ages and experiences (electoral democracy and polyarchy)
+    tb = add_age_and_experience(tb)
 
     # Create variable for universal suffrage
-    tb.loc[(tb["male_suffrage_lied"] == 0) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 0
-    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 1
-    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 1.5
-    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 2
+    tb = add_universal_suffrage(tb)
 
-    assert (
-        (tb["suffrage_lied"] == 1.5).sum() == 0
-    ), "There are countries with women suffrage but not men suffrage! This is not expected and can lead to confusing visualisations."
+    # Dtypes
+    tb["age_electdem_lied"] = tb["age_electdem_lied"].astype("string")
+    tb["age_polyarchy_lied"] = tb["age_polyarchy_lied"].astype("string")
+
+    # Format
+    tb = tb.format(["country", "year"])
 
-    # Add labels for ages of electoral democracies and polyarchies to optimize use in the OWID grapher
-    #############################
     #
     # Save outputs.
     #
@@ -130,3 +124,58 @@ def rename_columns(tb: Table) -> Table:
         }
     )
     return tb
+
+
+def add_is_democracy(tb: Table) -> Table:
+    """Create variable distinguishing between democracies and autocracies."""
+    mask = tb["regime_redux_lied"] == 6
+    tb.loc[mask, "democracy_lied"] = 1
+    tb.loc[~mask, "democracy_lied"] = 0
+    tb["democracy_lied"].metadata = tb["regime_redux_lied"].metadata
+    return tb
+
+
+def add_age_and_experience(tb: Table) -> Table:
+    """Add age and experience related indicators.
+
+    This includes:
+        - Number of consecutive years in electoral democracy and polyarchy (`age_*` columns)
+        - Number of total years in electoral democracy and polyarchy (`experience_*` columns)
+        - Age groups for electoral democracy and polyarchy (`group_age_*` columns)
+    """
+    columns = [
+        ("regime_lied", "electdem_lied", 5),  # in-regime when regime_lied > 5
+        ("regime_lied", "polyarchy_lied", 6),  # in-regime when regime_lied > 6
+    ]
+    # Add age_* and experience_* counters for each (source column, new suffix, threshold) tuple
+    tb = add_count_years_in_regime(
+        tb=tb,
+        columns=columns,
+    )
+
+    for col in columns:
+        col_age = f"age_{col[1]}"
+        # Add group_age_* column holding an age-bin label or, outside the bins, a regime name
+        tb = add_age_groups(tb=tb, column=col_age, column_raw=col[0], category_names=REGIME_LABELS, threshold=col[2])
+
+        # Where age is 0 or missing, overwrite age_* with the regime label mapped from the raw regime code
+        mapping = {num: label for num, label in REGIME_LABELS.items() if num <= col[2]}
+        mask = (tb[col_age] == 0) | (tb[col_age].isna())
+        tb.loc[mask, col_age] = tb.loc[mask, col[0]].replace(mapping)
+
+    return tb
+
+
+def add_universal_suffrage(tb: Table) -> Table:
+    """Add general population's suffrage rights."""
+    tb.loc[(tb["male_suffrage_lied"] == 0) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 0
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 1
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 1.5
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 2
+    tb["suffrage_lied"].metadata = tb["female_suffrage_lied"].metadata
+
+    assert (
+        (tb["suffrage_lied"] == 1.5).sum() == 0
+    ), "There are countries with women suffrage but not men suffrage! This is not expected and can lead to confusing visualisations."
+
+    return tb
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
index f6bd6a3aba1..62200c68758 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/shared.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List, Optional, cast
+from typing import Any, Callable, Dict, List, Optional, Tuple, cast
 
 import numpy as np
 import pandas as pd
@@ -314,3 +314,76 @@ def add_regions_and_global_aggregates(
     tb = concat([tb_regions, tb_world], ignore_index=True, short_name="region_counts")
 
     return tb
+
+
+def add_count_years_in_regime(
+    tb: Table,
+    columns: List[Tuple[str, str, int]],
+) -> Table:
+    """Add years in a certain regime.
+
+    Two types of counters are generated:
+        - Age: Number of years consecutively with a certain regime type.
+        - Experience: Number of years with a certain regime type.
+    """
+
+    def _count_years_in_regime(tb, col, col_new, th):
+        col_th = "thresholded"
+
+        tb[col_th] = pd.cut(tb[col], bins=[-float("inf"), th, float("inf")], labels=[0, 1]).astype(int)
+        # Add age of democracy
+        tb[f"age_{col_new}"] = tb.groupby(["country", tb[col_th].fillna(0).eq(0).cumsum()])[col_th].cumsum().astype(int)
+        tb[f"age_{col_new}"] = tb[f"age_{col_new}"].copy_metadata(tb[col])
+        # Add experience with democracy
+        tb[f"experience_{col_new}"] = tb.groupby("country")[col_th].cumsum().astype(int)
+        tb[f"experience_{col_new}"] = tb[f"age_{col_new}"].copy_metadata(tb[col])
+        # Sanity check
+        assert (tb.loc[tb[col_th] == 1, f"age_{col_new}"] != 0).all(), "Negative age found!"
+        assert (tb.loc[tb[col_th] == 1, f"experience_{col_new}"] != 0).all(), "Negative age found!"
+        # Drop unused columns
+        tb = tb.drop(columns=[col_th])
+        return tb
+
+    if columns:
+        for col in columns:
+            assert len(col) == 3, "Columns should be a list of tuples with 3 elements: (colname, col_newname, col_th)"
+            tb = _count_years_in_regime(tb, *col)
+    return tb
+
+
+def add_age_groups(
+    tb: Table,
+    column: str,
+    column_raw: str,
+    threshold: int,
+    category_names: Dict[Any, str],
+    age_bins: List[int | float] | None = None,
+) -> Table:
+    """Add `group_{column}`: age-bin label from `column`, else regime name from `category_names`."""
+    column_new = f"group_{column}"
+
+    if age_bins is None:
+        age_bins = [0, 18, 30, 60, 90, float("inf")]
+
+    # Build one "<lower>-<upper> years" label per pair of consecutive bin edges
+    assert len(age_bins) > 1, "There should be at least two age groups."
+    labels = []
+    for i in range(len(age_bins) - 1):
+        labels.append(f"{age_bins[i]}-{age_bins[i+1]} years")
+
+    # Create variable for age group of electoral demcoracies
+    tb[column_new] = pd.cut(
+        tb[column],
+        bins=age_bins,
+        labels=labels,
+    ).astype("string")
+
+    # Rows left without a bin (NA) get the regime name; NOTE: `break` assumes ids iterate in ascending order
+    for regime_id, regime_name in category_names.items():
+        if regime_id > threshold:
+            break
+        tb.loc[(tb[column_raw] == regime_id) & tb[column_new].isna(), column_new] = regime_name
+
+    # Carry over the metadata of the source column
+    tb[column_new] = tb[column_new].copy_metadata(tb[column])
+    return tb

From 84a7ea71160845b778b69b803f845cfa5870aaf8 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 18:51:18 +0200
Subject: [PATCH 09/23] cache only 1 entry

---
 apps/wizard/pages/indicator_upgrade/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/wizard/pages/indicator_upgrade/utils.py b/apps/wizard/pages/indicator_upgrade/utils.py
index c1571b16d8e..e17563f4949 100644
--- a/apps/wizard/pages/indicator_upgrade/utils.py
+++ b/apps/wizard/pages/indicator_upgrade/utils.py
@@ -118,7 +118,7 @@ def get_schema() -> Dict[str, Any]:
             return schema
 
 
-@st.cache_data
+@st.cache_data(max_entries=1)
 def get_indicators_from_datasets(
     dataset_id_1: int, dataset_id_2: int, show_new_not_in_old: int = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:

From 9df13f5d0a336e758c9cc9e82938a4bfd8dc57bd Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 18:52:31 +0200
Subject: [PATCH 10/23] add ttl

---
 apps/wizard/pages/indicator_upgrade/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/wizard/pages/indicator_upgrade/utils.py b/apps/wizard/pages/indicator_upgrade/utils.py
index e17563f4949..179c3777f39 100644
--- a/apps/wizard/pages/indicator_upgrade/utils.py
+++ b/apps/wizard/pages/indicator_upgrade/utils.py
@@ -118,7 +118,7 @@ def get_schema() -> Dict[str, Any]:
             return schema
 
 
-@st.cache_data(max_entries=1)
+@st.cache_data(max_entries=1, ttl=60 * 10)
 def get_indicators_from_datasets(
     dataset_id_1: int, dataset_id_2: int, show_new_not_in_old: int = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:

From 31773df55ba8401ae678380fd020fc89ceb76f59 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 19:26:49 +0200
Subject: [PATCH 11/23] tweaks

---
 .../data/garden/democracy/2024-03-07/lexical_index.meta.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index 943133cf743..e4eff3d4a5f 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -67,7 +67,7 @@ tables:
           Equivalent indicator: ``
 
       regime_redux_lied:
-        title:  Politicalregime (reduced)
+        title:  Political regime (reduced)
         unit: ''
         description_short: |-
           Identifies the political regime of a country. It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6).
@@ -123,7 +123,7 @@ tables:
           7: polyarchy
 
       male_suffrage_lied:
-        title: Universal suffrage for men
+        title: Universal right to vote for men
         unit: ''
         description_short: |-
           Indicates whether all men are allowed to vote in national elections. It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency.
@@ -133,7 +133,7 @@ tables:
           Equivalent indicator: `male_suffrage`
 
       female_suffrage_lied:
-        title: Universal suffrage for women
+        title: Universal right to vote for women
         unit: ''
         description_short: |-
           Indicates whether all women are allowed to vote in national elections. It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency.

From 40d1c1a456efc003041aeecfab238b3c590d0c93 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 22:14:23 +0200
Subject: [PATCH 12/23] wip

---
 .../2024-03-07/lexical_index.meta.yml         |  43 ++++
 .../democracy/2024-03-07/lexical_index.py     | 229 +++++++++++++++++-
 .../garden/democracy/2024-03-07/shared.py     |   4 +-
 .../democracy/2024-05-09/lexical_index.py     |   8 +-
 4 files changed, 274 insertions(+), 10 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index e4eff3d4a5f..11693faf835 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -205,3 +205,46 @@ tables:
         unit: ''
         description_short: Number of total years in a polyarchy. It sums all periods of polyarchy.
         description_key: *key_poly
+
+  region_aggregates:
+    variables:
+      num_democracy_lied:
+        title: num_democracy_lied
+        # description_short:
+        unit: "countries"
+      num_group_age_electdem_lied:
+        title: num_group_age_electdem_lied
+        # description_short:
+        unit: "countries"
+      num_group_age_polyarchy_lied:
+        title: num_group_age_polyarchy_lied
+        # description_short:
+        unit: "countries"
+      num_regime_lied:
+        title: num_regime_lied
+        # description_short:
+        unit: "countries"
+      num_suffrage_lied:
+        title: num_suffrage_lied
+        # description_short:
+        unit: "countries"
+      pop_democracy_lied:
+        title: pop_democracy_lied
+        # description_short:
+        unit: "people"
+      pop_group_age_electdem_lied:
+        title: pop_group_age_electdem_lied
+        # description_short:
+        unit: "people"
+      pop_group_age_polyarchy_lied:
+        title: pop_group_age_polyarchy_lied
+        # description_short:
+        unit: "people"
+      pop_regime_lied:
+        title: pop_regime_lied
+        # description_short:
+        unit: "people"
+      pop_suffrage_lied:
+        title: pop_suffrage_lied
+        # description_short:
+        unit: "people"
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index a1fdf1af5d2..b2c96e72c41 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -1,7 +1,17 @@
 """Load a meadow dataset and create a garden dataset."""
 
-from owid.catalog import Table
-from shared import add_age_groups, add_count_years_in_regime
+from typing import cast
+
+from owid.catalog import Dataset, Table
+from shared import (
+    add_age_groups,
+    add_count_years_in_regime,
+    add_population_in_dummies,
+    add_regions_and_global_aggregates,
+    expand_observations,
+    from_wide_to_long,
+    make_table_with_dummies,
+)
 
 from etl.data_helpers import geo
 from etl.helpers import PathFinder, create_dataset
@@ -19,6 +29,55 @@
     6: "electoral democracy",
 }
 
+REGIONS = {
+    "Africa": {
+        "additional_members": [
+            "Cape Colony",
+            "Natal",
+            "Orange Free State",
+            "Transvaal",
+            "Somaliland",
+            "Zanzibar",
+        ]
+    },
+    "Asia": {
+        "additional_members": [
+            "Palestine/Gaza",
+            "Palestine/West Bank",
+            "Republic of Vietnam",
+            "Democratic Republic of Vietnam",
+            "Ottoman Empire",
+            "Tibet",
+        ]
+    },
+    "North America": {
+        "additional_members": [
+            "United Provinces of Central America",
+        ]
+    },
+    "South America": {
+        "additional_members": [
+            "Great Colombia",
+        ]
+    },
+    "Europe": {
+        "additional_members": [
+            "Brunswick",
+            "Hamburg",
+            "Hesse-Darmstadt",
+            "Hesse-Kassel",
+            "Nassau",
+            "Oldenburg",
+            "Papal States",
+            "Prussia",
+            "Kingdom of Sardinia",
+            "Saxe-Weimar-Eisenach",
+            "Kingdom of the Two Sicilies",
+        ]
+    },
+    "Oceania": {},
+}
+
 
 def run(dest_dir: str) -> None:
     #
@@ -26,6 +85,8 @@ def run(dest_dir: str) -> None:
     #
     # Load meadow dataset.
     ds_meadow = paths.load_dataset("lexical_index")
+    ds_regions = paths.load_dataset("regions")
+    ds_population = paths.load_dataset("population")
 
     # Read table from meadow dataset.
     tb = ds_meadow["lexical_index"].reset_index()
@@ -49,15 +110,31 @@ def run(dest_dir: str) -> None:
     tb["age_electdem_lied"] = tb["age_electdem_lied"].astype("string")
     tb["age_polyarchy_lied"] = tb["age_polyarchy_lied"].astype("string")
 
+    # Checks on countries
+    assert set(
+        tb.loc[tb["country"].str.contains("Germany") & (tb["year"] < 1990) & (tb["year"] > 1944), "country"]
+    ) == {"East Germany", "West Germany"}, "Other versions of Germany!"
+    assert set(
+        tb.loc[tb["country"].str.contains("Germany") & ((tb["year"] >= 1990) | (tb["year"] <= 1944)), "country"]
+    ) == {"Germany"}, "Other versions of Germany!"
+
+    # Get region data
+    tb_regions = get_region_aggregates(tb, ds_regions, ds_population)
+
     # Format
     tb = tb.format(["country", "year"])
+    tb_regions = tb_regions.format(["country", "year", "category"], short_name="region_aggregates")
 
     #
     # Save outputs.
     #
     # Create a new garden dataset with the same metadata as the meadow dataset.
+    tables = [
+        tb,
+        tb_regions,
+    ]
     ds_garden = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata
     )
 
     # Save changes in the new garden dataset.
@@ -128,9 +205,9 @@ def rename_columns(tb: Table) -> Table:
 
 def add_is_democracy(tb: Table) -> Table:
     """Create variable distinguishing between democracies and autocracies."""
-    mask = tb["regime_redux_lied"] == 6
-    tb.loc[mask, "democracy_lied"] = 1
-    tb.loc[~mask, "democracy_lied"] = 0
+    tb.loc[tb["regime_redux_lied"] == 6, "democracy_lied"] = 1
+    tb.loc[(tb["regime_redux_lied"] >= 0) & (tb["regime_redux_lied"] < 6), "democracy_lied"] = 0
+    tb["democracy_lied"] = tb["democracy_lied"].astype(int)
     tb["democracy_lied"].metadata = tb["regime_redux_lied"].metadata
     return tb
 
@@ -179,3 +256,143 @@ def add_universal_suffrage(tb: Table) -> Table:
     ), "There are countries with women suffrage but not men suffrage! This is not expected and can lead to confusing visualisations."
 
     return tb
+
+
+def get_region_aggregates(tb: Table, ds_regions: Dataset, ds_population: Dataset) -> Table:
+    """Build a long-format table of regional aggregates for the categorical indicators.
+
+    Per (country, year, category) it holds counts of countries (`num_*`) and of people living in them (`pop_*`)."""
+    tb_ = tb.copy()
+
+    # Use the nullable integer dtype (Int64) for the integer-coded indicators
+    tb_ = tb_.astype(
+        {
+            "democracy_lied": "Int64",
+            "regime_lied": "Int64",
+        }
+    )
+    tb_ = cast(Table, tb_)
+
+    # Define columns on which we will estimate (i) "number of countries" and (ii) "number of people living in ..."
+    indicators = [
+        {
+            "name": "democracy_lied",
+            "values_expected": {"0": "autorcracy", "1": "democracy"},
+            "has_na": False,
+        },
+        {
+            "name": "regime_lied",
+            "values_expected": {
+                "0": "non-electoral autorcracy",
+                "1": "one-party autocracy",
+                "2": "multi-party autocracy without elected executive",
+                "3": "multi-party autocracy",
+                "4": "exclusive democracy",
+                "5": "male democracy",
+                "6": "electoral democracy",
+                "7": "polyarchy",
+            },
+            "has_na": False,
+        },
+        {
+            "name": "suffrage_lied",
+            "values_expected": {
+                "0.0": "no suffrage",
+                "1.0": "male suffrage",
+                "2.0": "universal suffrage",
+            },
+            "has_na": False,
+        },
+    ]
+    for col in ["group_age_electdem_lied", "group_age_polyarchy_lied"]:
+        indicators.append(
+            {
+                "name": col,
+                "values_expected": {v: v for v in set(tb_[col].fillna("-1"))},
+                "has_na": False,
+            }
+        )
+
+    indicator_names = [indicator["name"] for indicator in indicators]
+
+    # 1) Number of countries
+    ## Make dummies
+    tb_num = make_table_with_dummies(tb_, indicators)
+
+    ## Count
+    tb_num = add_regions_and_global_aggregates(tb_num, ds_regions, regions=REGIONS)
+    tb_num = from_wide_to_long(tb_num)
+    tb_num = tb_num.rename(columns=dict(zip(indicator_names, [f"num_{i}" for i in indicator_names])))
+
+    # 2) Number of people
+    ## Get missing years (not to miss anyone!) -- Note that this can lead to country overlaps (e.g. USSR and Latvia)
+    tb_pop = expand_observations_without_duplicates(tb_)
+    print(f"{tb_.shape} -> {tb_pop.shape}")
+
+    ## Make dummies (the indicator dicts are flagged in place as containing NAs for the expanded table)
+    for ind in indicators:
+        ind["has_na"] = True
+    tb_pop = make_table_with_dummies(tb_pop, indicators)
+
+    ## Counts
+    tb_pop = add_population_in_dummies(tb_pop, ds_population)
+    tb_pop = add_regions_and_global_aggregates(tb_pop, ds_regions, regions=REGIONS)
+    tb_pop = from_wide_to_long(tb_pop)
+    tb_pop = tb_pop.rename(columns=dict(zip(indicator_names, [f"pop_{i}" for i in indicator_names])))
+    tb_pop = tb_pop[tb_pop["year"] >= 1800]
+
+    # 3) Merge
+    tb_regions = tb_num.merge(tb_pop, on=["country", "year", "category"], how="inner")
+    # NOTE: the inner join keeps only (country, year, category) rows present in both tables; shapes are not checked.
+    # NOTE(review): rows of the "-1" category are kept as-is in both the num_* and pop_* columns.
+
+    return tb_regions
+
+
+def expand_observations_without_duplicates(tb: Table) -> Table:
+    tb_exp = expand_observations(tb)
+    tb_exp = tb_exp.loc[
+        ~(
+            # YUGOSLAVIA
+            ((tb_exp["country"] == "Yugoslavia") & ((tb_exp["year"] > 1990) | (tb_exp["year"] < 1918)))
+            | ((tb_exp["country"] == "Slovenia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
+            | ((tb_exp["country"] == "North Macedonia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
+            | ((tb_exp["country"] == "Croatia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
+            | ((tb_exp["country"] == "Bosnia and Herzegovina") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
+            | ((tb_exp["country"] == "Serbia and Montenegro") & ((tb_exp["year"] > 2005) | (tb_exp["year"] < 1992)))
+            | ((tb_exp["country"] == "Serbia") & ((tb_exp["year"] > 1917) & (tb_exp["year"] <= 2005)))
+            | ((tb_exp["country"] == "Montenegro") & ((tb_exp["year"] > 1914) & (tb_exp["year"] <= 2005)))
+            | ((tb_exp["country"] == "Kosovo") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 2007)))
+            # YEMEN
+            | ((tb_exp["country"] == "Yemen Arab Republic") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1940)))
+            | ((tb_exp["country"] == "Yemen People's Republic") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1940)))
+            | ((tb_exp["country"] == "Yemen") & ((tb_exp["year"] >= 1940) & (tb_exp["year"] <= 1989)))
+            # GERMANY
+            | ((tb_exp["country"] == "West Germany") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1945)))
+            | ((tb_exp["country"] == "East Germany") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1945)))
+            | ((tb_exp["country"] == "Germany") & (tb_exp["year"] >= 1945) & (tb_exp["year"] <= 1989))
+            # USSR
+            | ((tb_exp["country"] == "USSR") & ((tb_exp["year"] > 1990) | (tb_exp["year"] < 1941)))
+            | ((tb_exp["country"] == "Uzbekistan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Kazakhstan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Turkmenistan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Kyrgyzstan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Tajikistan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Russia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Ukraine") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Belarus") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Moldova") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Latvia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Lithuania") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Estonia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Armenia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Georgia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            | ((tb_exp["country"] == "Azerbaijan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
+            # CZECHOSLOVAKIA
+            | ((tb_exp["country"] == "Czechoslovakia") & ((tb_exp["year"] > 1992) | (tb_exp["year"] < 1943)))
+            | ((tb_exp["country"] == "Czechia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1943)))
+            | ((tb_exp["country"] == "Slovakia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1943)))
+        ),
+    ]
+
+    return tb_exp
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
index 62200c68758..d864b303349 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/shared.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -227,7 +227,7 @@ def make_table_with_dummies(
         # Check and fix NA (convert NAs to -1 category)
         if indicator["has_na"]:
             # Assert that there are actually NaNs
-            assert tb_[indicator["name"]].isna().any(), "No NA found!"
+            assert tb_[indicator["name"]].isna().any(), f"No NA found in {indicator['name']}!"
             # If NA, we should not have category '-1', otherwise these would get merged!
             assert "-1" not in set(
                 tb_[indicator["name"]].unique()
@@ -239,7 +239,7 @@ def make_table_with_dummies(
             else:
                 values_expected |= {"-1"}
         else:
-            assert not tb_[indicator["name"]].isna().any(), "NA found!"
+            assert not tb_[indicator["name"]].isna().any(), f"NA found in {indicator['name']}!"
 
         values_found = set(tb_[indicator["name"]].unique())
         assert values_found == set(
diff --git a/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py b/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py
index d4eba37e868..13227e5d023 100644
--- a/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py
+++ b/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py
@@ -15,17 +15,21 @@ def run(dest_dir: str) -> None:
 
     # Read table from garden dataset.
     tb = ds_garden["lexical_index"]
+    tb_regions = ds_garden["region_aggregates"]
 
     #
     # Process data.
     #
-
+    tables = [
+        tb,
+        tb_regions,
+    ]
     #
     # Save outputs.
     #
     # Create a new grapher dataset with the same metadata as the garden dataset.
     ds_grapher = create_dataset(
-        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+        dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata
     )
 
     # Save changes in the new grapher dataset.

From f9f091d22df3ef416a0cc8c402401cb10a5f1560 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 22:44:14 +0200
Subject: [PATCH 13/23] working grapher import

---
 .../2024-03-07/lexical_index.meta.yml         | 71 ++++++++++++++++---
 .../democracy/2024-03-07/lexical_index.py     |  4 +-
 .../garden/democracy/2024-03-07/shared.py     |  2 +-
 3 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index 11693faf835..a937996f002 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -209,42 +209,93 @@ tables:
   region_aggregates:
     variables:
       num_democracy_lied:
-        title: num_democracy_lied
+        title: |-
+          <% if category == '-1' %>
+          Number of countries with unknown regime (democracy/autocracy)
+          <% else %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          <% endif %>
         # description_short:
         unit: "countries"
       num_group_age_electdem_lied:
-        title: num_group_age_electdem_lied
+        title: |-
+          <% if 'years' in category %>
+          Number of electoral democracies aged << category >> (age groups electoral)
+          <% else %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          <% endif %>
         # description_short:
         unit: "countries"
       num_group_age_polyarchy_lied:
-        title: num_group_age_polyarchy_lied
+        title: |-
+          <% if 'years' in category %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (age groups polyarchy)
+          <% else %>
+          Number of polyarchies aged << category >>
+          <% endif %>
         # description_short:
         unit: "countries"
       num_regime_lied:
-        title: num_regime_lied
+        title: |-
+          <% if category == '-1' %>
+          Number of countries with unknown regime
+          <% else %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (regimes)
+          <% endif %>
         # description_short:
         unit: "countries"
       num_suffrage_lied:
-        title: num_suffrage_lied
+        title: |-
+          <% if category == '-1' %>
+          Number of countries with unknown suffrage
+          <% else %>
+          Number of countries with << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          <% endif %>
         # description_short:
         unit: "countries"
+
       pop_democracy_lied:
-        title: pop_democracy_lied
+        title: |-
+          <% if category == '-1' %>
+          People living in countries with unknown regime (autocracy/democracy)
+          <% else %>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          <% endif %>
         # description_short:
         unit: "people"
       pop_group_age_electdem_lied:
-        title: pop_group_age_electdem_lied
+        title: |-
+          <% if 'years' in category %>
+          People living in electoral democracies aged << category >>
+          <% else %>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>  (age groups electoral)
+          <% endif %>
         # description_short:
         unit: "people"
       pop_group_age_polyarchy_lied:
-        title: pop_group_age_polyarchy_lied
+        title: |-
+          <% if 'years' in category %>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>  (age groups polyarchy)
+          <% else %>
+          People living in polyarchies aged << category >>
+          <% endif %>
         # description_short:
         unit: "people"
       pop_regime_lied:
-        title: pop_regime_lied
+        title: |-
+          <% if category == '-1' %>
+          People living in countries with unknown regime
+          <% else %>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          <% endif %>
         # description_short:
         unit: "people"
       pop_suffrage_lied:
-        title: pop_suffrage_lied
+        title: |-
+          <% if category == '-1' %>
+          People living in countries with unknown suffrage
+          <% else %>
+          People living in countries with << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          <% endif %>
         # description_short:
         unit: "people"
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index b2c96e72c41..97db07ad07e 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -277,13 +277,13 @@ def get_region_aggregates(tb: Table, ds_regions: Dataset, ds_population: Dataset
     indicators = [
         {
             "name": "democracy_lied",
-            "values_expected": {"0": "autorcracy", "1": "democracy"},
+            "values_expected": {"0": "autocracy", "1": "democracy"},
             "has_na": False,
         },
         {
             "name": "regime_lied",
             "values_expected": {
-                "0": "non-electoral autorcracy",
+                "0": "non-electoral autocracy",
                 "1": "one-party autocracy",
                 "2": "multi-party autocracy without elected executive",
                 "3": "multi-party autocracy",
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
index d864b303349..6a763697095 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/shared.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -369,7 +369,7 @@ def add_age_groups(
     assert len(age_bins) > 1, "There should be at least two age groups."
     labels = []
     for i in range(len(age_bins) - 1):
-        labels.append(f"{age_bins[i]}-{age_bins[i+1]} years")
+        labels.append(f"{age_bins[i]}-{age_bins[i+1]} years".replace("-inf", "+"))
 
     # Create variable for age group of electoral demcoracies
     tb[column_new] = pd.cut(

From 48b92bf2ca4e65ad73a8270f906a66107d71f730 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 22:48:46 +0200
Subject: [PATCH 14/23] minor fixes

---
 .../democracy/2024-03-07/lexical_index.meta.yml   | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index a937996f002..fb2450821fb 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -220,18 +220,17 @@ tables:
       num_group_age_electdem_lied:
         title: |-
           <% if 'years' in category %>
-          Number of electoral democracies aged << category >> (age groups electoral)
+          Number of electoral democracies aged << category >>
           <% else %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (age groups electoral)
           <% endif %>
-        # description_short:
         unit: "countries"
       num_group_age_polyarchy_lied:
         title: |-
           <% if 'years' in category %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (age groups polyarchy)
-          <% else %>
           Number of polyarchies aged << category >>
+          <% else %>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (age groups polyarchy)
           <% endif %>
         # description_short:
         unit: "countries"
@@ -240,7 +239,7 @@ tables:
           <% if category == '-1' %>
           Number of countries with unknown regime
           <% else %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (regimes)
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >>
           <% endif %>
         # description_short:
         unit: "countries"
@@ -275,9 +274,9 @@ tables:
       pop_group_age_polyarchy_lied:
         title: |-
           <% if 'years' in category %>
-          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>  (age groups polyarchy)
-          <% else %>
           People living in polyarchies aged << category >>
+          <% else %>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>  (age groups polyarchy)
           <% endif %>
         # description_short:
         unit: "people"

From 31c531dbdee0a3c84125d48c67ffc796d6316476 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 22:50:26 +0200
Subject: [PATCH 15/23] pre 1800 for numbers

---
 etl/steps/data/garden/democracy/2024-03-07/lexical_index.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index 97db07ad07e..d57a3734c2a 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -342,7 +342,7 @@ def get_region_aggregates(tb: Table, ds_regions: Dataset, ds_population: Dataset
     tb_pop = tb_pop[tb_pop["year"] >= 1800]
 
     # 3) Merge
-    tb_regions = tb_num.merge(tb_pop, on=["country", "year", "category"], how="inner")
+    tb_regions = tb_num.merge(tb_pop, on=["country", "year", "category"], how="outer")
     # assert (tb_num.shape == tb_pop.shape) and (len(tb_num) == len(tb_regions))
     # tb_regions.loc[tb_regions["category"] == "-1", ["num_regime_ert", "num_regime_trich_ert"]] = float("nan")
 

From 6ec6ff271cb5375f746efe9b06d7e3264e68a67a Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Fri, 10 May 2024 23:11:08 +0200
Subject: [PATCH 16/23] minor fixes

---
 .../2024-03-07/lexical_index.meta.yml         | 20 +++++++++----------
 .../garden/democracy/2024-03-07/shared.py     |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index fb2450821fb..b378660ff14 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -213,7 +213,7 @@ tables:
           <% if category == '-1' %>
           Number of countries with unknown regime (democracy/autocracy)
           <% else %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
           <% endif %>
         # description_short:
         unit: "countries"
@@ -222,7 +222,7 @@ tables:
           <% if 'years' in category %>
           Number of electoral democracies aged << category >>
           <% else %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (age groups electoral)
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups electoral)
           <% endif %>
         unit: "countries"
       num_group_age_polyarchy_lied:
@@ -230,7 +230,7 @@ tables:
           <% if 'years' in category %>
           Number of polyarchies aged << category >>
           <% else %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >> (age groups polyarchy)
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups polyarchy)
           <% endif %>
         # description_short:
         unit: "countries"
@@ -239,7 +239,7 @@ tables:
           <% if category == '-1' %>
           Number of countries with unknown regime
           <% else %>
-          Number of << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
           <% endif %>
         # description_short:
         unit: "countries"
@@ -248,7 +248,7 @@ tables:
           <% if category == '-1' %>
           Number of countries with unknown suffrage
           <% else %>
-          Number of countries with << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          Number of countries with << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
           <% endif %>
         # description_short:
         unit: "countries"
@@ -258,7 +258,7 @@ tables:
           <% if category == '-1' %>
           People living in countries with unknown regime (autocracy/democracy)
           <% else %>
-          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
           <% endif %>
         # description_short:
         unit: "people"
@@ -267,7 +267,7 @@ tables:
           <% if 'years' in category %>
           People living in electoral democracies aged << category >>
           <% else %>
-          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>  (age groups electoral)
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>  (age groups electoral)
           <% endif %>
         # description_short:
         unit: "people"
@@ -276,7 +276,7 @@ tables:
           <% if 'years' in category %>
           People living in polyarchies aged << category >>
           <% else %>
-          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>  (age groups polyarchy)
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>  (age groups polyarchy)
           <% endif %>
         # description_short:
         unit: "people"
@@ -285,7 +285,7 @@ tables:
           <% if category == '-1' %>
           People living in countries with unknown regime
           <% else %>
-          People living in << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
           <% endif %>
         # description_short:
         unit: "people"
@@ -294,7 +294,7 @@ tables:
           <% if category == '-1' %>
           People living in countries with unknown suffrage
           <% else %>
-          People living in countries with << category.replace('_', ' ').replace('cracy', 'cracies') >>
+          People living in countries with << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >>
           <% endif %>
         # description_short:
         unit: "people"
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
index 6a763697095..f8f936016ca 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/shared.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -369,7 +369,7 @@ def add_age_groups(
     assert len(age_bins) > 1, "There should be at least two age groups."
     labels = []
     for i in range(len(age_bins) - 1):
-        labels.append(f"{age_bins[i]}-{age_bins[i+1]} years".replace("-inf", "+"))
+        labels.append(f"{age_bins[i]+1}-{age_bins[i+1]} years".replace("-inf", "+"))
 
     # Create variable for age group of electoral demcoracies
     tb[column_new] = pd.cut(

From 728ad82911f2c81a8bf0f0fae2e543f3936e803b Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Sat, 11 May 2024 18:32:26 +0200
Subject: [PATCH 17/23] extend czechoslovakia, fix indicator title

---
 .../data/garden/democracy/2024-03-07/lexical_index.meta.yml | 4 ++++
 etl/steps/data/garden/democracy/2024-03-07/lexical_index.py | 6 +++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index b378660ff14..47401c906b0 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -221,6 +221,8 @@ tables:
         title: |-
           <% if 'years' in category %>
           Number of electoral democracies aged << category >>
+          <% elif category == '-1' %>
+          Number of countries with unknown regime (age groups electoral)
           <% else %>
           Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups electoral)
           <% endif %>
@@ -229,6 +231,8 @@ tables:
         title: |-
           <% if 'years' in category %>
           Number of polyarchies aged << category >>
+          <% elif category == '-1' %>
+          Number of countries with unknown regime (age groups polyarchy)
           <% else %>
           Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups polyarchy)
           <% endif %>
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index d57a3734c2a..03b59fff9e6 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -389,9 +389,9 @@ def expand_observations_without_duplicates(tb: Table) -> Table:
             | ((tb_exp["country"] == "Georgia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
             | ((tb_exp["country"] == "Azerbaijan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
             # CZECHOSLOVAKIA
-            | ((tb_exp["country"] == "Czechoslovakia") & ((tb_exp["year"] > 1992) | (tb_exp["year"] < 1943)))
-            | ((tb_exp["country"] == "Czechia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1943)))
-            | ((tb_exp["country"] == "Slovakia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1943)))
+            | ((tb_exp["country"] == "Czechoslovakia") & ((tb_exp["year"] > 1992) | (tb_exp["year"] < 1918)))
+            | ((tb_exp["country"] == "Czechia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1918)))
+            | ((tb_exp["country"] == "Slovakia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1918)))
         ),
     ]
 

From 825bd26714de87420b3c25fb4881611da34404d4 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Sat, 11 May 2024 18:37:58 +0200
Subject: [PATCH 18/23] change starting year for serbia, bug in years for
 czechia/slovakia

---
 etl/steps/data/garden/democracy/2024-03-07/lexical_index.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index 03b59fff9e6..ae515e38feb 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -359,7 +359,7 @@ def expand_observations_without_duplicates(tb: Table) -> Table:
             | ((tb_exp["country"] == "North Macedonia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
             | ((tb_exp["country"] == "Croatia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
             | ((tb_exp["country"] == "Bosnia and Herzegovina") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990)))
-            | ((tb_exp["country"] == "Serbia and Montenegro") & ((tb_exp["year"] > 2005) | (tb_exp["year"] < 1992)))
+            | ((tb_exp["country"] == "Serbia and Montenegro") & ((tb_exp["year"] > 2005) | (tb_exp["year"] <= 1990)))
             | ((tb_exp["country"] == "Serbia") & ((tb_exp["year"] > 1917) & (tb_exp["year"] <= 2005)))
             | ((tb_exp["country"] == "Montenegro") & ((tb_exp["year"] > 1914) & (tb_exp["year"] <= 2005)))
             | ((tb_exp["country"] == "Kosovo") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 2007)))
@@ -390,8 +390,8 @@ def expand_observations_without_duplicates(tb: Table) -> Table:
             | ((tb_exp["country"] == "Azerbaijan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990))
             # CZECHOSLOVAKIA
             | ((tb_exp["country"] == "Czechoslovakia") & ((tb_exp["year"] > 1992) | (tb_exp["year"] < 1918)))
-            | ((tb_exp["country"] == "Czechia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1918)))
-            | ((tb_exp["country"] == "Slovakia") & ((tb_exp["year"] <= 1992) | (tb_exp["year"] >= 1918)))
+            | ((tb_exp["country"] == "Czechia") & ((tb_exp["year"] <= 1992) & (tb_exp["year"] >= 1918)))
+            | ((tb_exp["country"] == "Slovakia") & ((tb_exp["year"] <= 1992) & (tb_exp["year"] >= 1918)))
         ),
     ]
 

From daf5e66e2c10ab1f17fbde6c23e962d57fc32685 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Sun, 12 May 2024 10:44:50 +0200
Subject: [PATCH 19/23] distribute ussr population in Asia/Europe

---
 .../democracy/2024-03-07/lexical_index.py     | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index ae515e38feb..096dbe4e017 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -1,8 +1,10 @@
 """Load a meadow dataset and create a garden dataset."""
 
+import ast
 from typing import cast
 
 from owid.catalog import Dataset, Table
+from owid.catalog.processing import concat
 from shared import (
     add_age_groups,
     add_count_years_in_regime,
@@ -334,6 +336,9 @@ def get_region_aggregates(tb: Table, ds_regions: Dataset, ds_population: Dataset
         ind["has_na"] = True
     tb_pop = make_table_with_dummies(tb_pop, indicators)
 
+    # Replace USSR -> current states
+    tb_pop = replace_ussr(tb_pop, ds_regions)
+
     ## Counts
     tb_pop = add_population_in_dummies(tb_pop, ds_population)
     tb_pop = add_regions_and_global_aggregates(tb_pop, ds_regions, regions=REGIONS)
@@ -396,3 +401,27 @@ def expand_observations_without_duplicates(tb: Table) -> Table:
     ]
 
     return tb_exp
+
+
+def replace_ussr(tb: Table, ds_regions: Dataset) -> Table:
+    """Replace the rows of "USSR" by one copy of them per successor state.
+
+    Successor names come from the `successors` field of region "OWID_USS" in the
+    regions table of `ds_regions`. The result is sorted by country and year.
+    """
+    tb_regions = ds_regions["regions"]
+    # `successors` holds a string literal that evaluates to the successors' region codes
+    codes = tb_regions.loc["OWID_USS", "successors"]
+    successors = set(tb_regions.loc[ast.literal_eval(codes), "name"])
+
+    # USSR rows are the same for every successor: select them once, outside the loop
+    tb_ussr = tb.loc[tb["country"] == "USSR"]
+    tb_succ = []
+    for successor in successors:
+        tb_ = tb_ussr.copy()
+        tb_["country"] = successor
+        tb_succ.append(tb_)
+    tb_succ = concat(tb_succ, ignore_index=True)
+    # Add the successor rows, then drop the original USSR rows
+    tb = concat([tb, tb_succ], ignore_index=True).sort_values(["country", "year"])
+    return tb.loc[~(tb["country"] == "USSR")]

From 496cc2b4c89e02fe4153f01abb858c75060d249d Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Mon, 13 May 2024 12:39:27 +0200
Subject: [PATCH 20/23] impute some values

---
 .../lexical_index.countries_impute.yml        | 69 +++++++++++++++++
 .../democracy/2024-03-07/lexical_index.py     |  8 ++
 .../garden/democracy/2024-03-07/shared.py     | 76 +++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml
new file mode 100644
index 00000000000..202c09cb0f8
--- /dev/null
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml
@@ -0,0 +1,69 @@
+# List specifying how to impute values for specific countries.
+#
+# As an example:
+#
+# - country: Panama
+#   country_impute: Colombia
+#   year_min: 1832
+#   year_max: 1902
+#
+# This means that we want to inherit the classifications for Panama from Colombia between 1832 and 1902.
+#
+# We note that `country` can also be a list of countries.
+# USSR
+- country:
+    - Russia
+    - Ukraine
+    - Belarus
+    - Azerbaijan
+    - Armenia
+    - Georgia
+  country_impute: USSR
+  year_min: 1922
+  year_max: 1990
+- country:
+    - Turkmenistan
+    - Uzbekistan
+    - Kazakhstan
+    - Tajikistan
+    - Kyrgyzstan
+  country_impute: USSR
+  year_min: 1922
+  year_max: 1989
+
+- country:
+    - Lithuania
+    - Latvia
+    - Estonia
+    - Moldova
+  country_impute: USSR
+  year_min: 1940
+  year_max: 1990
+
+# Russian Empire
+- country:
+    - Belarus
+    - Georgia
+    - Turkmenistan
+    - Kazakhstan
+  country_impute: Russia
+  year_min: 1800
+  year_max: 1921
+- country:
+  - Tajikistan
+  - Kyrgyzstan
+  country_impute: Russia
+  year_min: 1868
+  year_max: 1921
+- country: Uzbekistan
+  country_impute: Russia
+  year_min: 1865
+  year_max: 1911
+- country: Moldova
+  country_impute: Russia
+  year_min: 1800
+  year_max: 1919
+- country: Azerbaijan
+  country_impute: Russia
+  year_min: 1813
+  year_max: 1921
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index 096dbe4e017..4c0cb6e2ff4 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -8,6 +8,7 @@
 from shared import (
     add_age_groups,
     add_count_years_in_regime,
+    add_imputes,
     add_population_in_dummies,
     add_regions_and_global_aggregates,
     expand_observations,
@@ -20,6 +21,7 @@
 
 # Get paths and naming conventions for current step.
 paths = PathFinder(__file__)
+PATH_IMPUTE = paths.directory / "lexical_index.countries_impute.yml"
 
 REGIME_LABELS = {
     0: "non-electoral autocracy",
@@ -120,6 +122,12 @@ def run(dest_dir: str) -> None:
         tb.loc[tb["country"].str.contains("Germany") & ((tb["year"] >= 1990) | (tb["year"] <= 1944)), "country"]
     ) == {"Germany"}, "Other versions of Germany!"
 
+    # Impute values
+    tb = add_imputes(
+        tb=tb,
+        path=PATH_IMPUTE,
+    )
+
     # Get region data
     tb_regions = get_region_aggregates(tb, ds_regions, ds_population)
 
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
index f8f936016ca..d36f5846b71 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/shared.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -1,7 +1,9 @@
+from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, cast
 
 import numpy as np
 import pandas as pd
+import yaml
 from owid.catalog import Dataset, Table
 from owid.catalog.tables import concat
 
@@ -387,3 +389,77 @@ def add_age_groups(
     # Copy metadata
     tb[column_new] = tb[column_new].copy_metadata(tb[column])
     return tb
+
+
+def add_imputes(tb: Table, path: Path, cols_verify: List[str] | None = None) -> Table:
+    """Add imputed values to the table.
+
+    Imputed values are inferred from historical equivalents.
+
+    Example: Was "Eritrea" a democracy in 1993? We can infer this from "Ethiopia (former)" (historical equivalent).
+    The mappings are read from the YAML file at `path`; each entry has `country` (a name or a list of names),
+    `country_impute` (the country whose rows are copied), and `year_min` / `year_max` (inclusive year range).
+    This is useful to (i) be able to colour these world regions in grapher map charts, and (ii) to be able to count the number of people living in democracy (in `make_tables_population_counters`).
+    Note that these "imputed country values" are ignored when estimating the number of countries in democracies (function `make_tables_country_counters`), since these countries did not exist at the time!
+
+    Returns a copy of `tb` with the imputed rows added and flagged in a boolean column, sorted by `cols_verify` (default: "country", "year"), which must uniquely identify rows.
+    """
+    tb_ = tb.copy()
+
+    if cols_verify is None:
+        cols_verify = ["country", "year"]
+
+    # Load impute data
+    countries_impute = yaml.safe_load(path.read_text())
+
+    # Build one table of copied rows per country to impute
+
+    tb_imputed = []
+    for impute in countries_impute:
+        # Get rows of the source country within [year_min, year_max]; a missing bound defaults to a sentinel that matches no realistic year
+        tb_imp_ = tb_.loc[
+            (tb_["country"] == impute["country_impute"])
+            & (tb_["year"] >= impute.get("year_min", 99999))
+            & (tb_["year"] <= impute.get("year_max", -99999))
+        ].copy()
+        # Sanity checks: some data was found, and it reaches both year bounds
+        assert tb_imp_.shape[0] > 0, f"No data found for {impute['country_impute']}"
+        assert tb_imp_["year"].max() == impute["year_max"], f"Missing years (max check) for {impute['country_impute']}"
+        assert tb_imp_["year"].min() == impute["year_min"], f"Missing years (min check) for {impute['country_impute']}"
+
+        # Tweak them
+        # tb_ = tb_.rename(
+        #     columns={
+        #         "country": "regime_imputed_country",
+        #     }
+        # )
+        tb_imp_["values_imputed"] = True
+
+        # Different behaviour depending whether we have a list of countries or a single country to impute
+        if isinstance(impute["country"], list):
+            for country in impute["country"]:
+                tb_imp_["country"] = country
+                tb_imputed.append(tb_imp_.copy())
+        else:
+            tb_imp_["country"] = impute["country"]
+            tb_imputed.append(tb_imp_)
+
+    tb_ = concat(tb_imputed + [tb_], ignore_index=True)
+
+    # Set to False by default (for non-imputed countries)
+    tb_["values_imputed"] = tb_["values_imputed"].fillna(False).astype(bool)
+
+    # Re-order columns
+    # cols = [
+    #     "country",
+    #     "year",
+    #     "regime",
+    #     "regime_womsuffr",
+    #     "regime_imputed_country",
+    #     "regime_imputed",
+    # ]
+    # tb_ = cast(Table, tb_[cols])
+
+    # Verify that there are no duplicates in `cols_verify` (set_index fails otherwise), then sort by them
+    tb_ = tb_.set_index(cols_verify, verify_integrity=True).sort_index().reset_index()
+    return tb_

From 721ecc503e47b8092cef7063519c1addca3cab74 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Mon, 13 May 2024 12:45:59 +0200
Subject: [PATCH 21/23] tweaks

---
 .../democracy/2024-03-07/lexical_index.py       | 17 +++++++++++------
 .../data/garden/democracy/2024-03-07/shared.py  | 11 ++++++++---
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index 4c0cb6e2ff4..2f402098ab7 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -123,13 +123,14 @@ def run(dest_dir: str) -> None:
     ) == {"Germany"}, "Other versions of Germany!"
 
     # Impute values
-    tb = add_imputes(
-        tb=tb,
-        path=PATH_IMPUTE,
-    )
+    tb = add_imputes(tb=tb, path=PATH_IMPUTE, col_flag_imputed="values_imputed")
 
     # Get region data
-    tb_regions = get_region_aggregates(tb, ds_regions, ds_population)
+    tb_regions = tb.loc[~tb["values_imputed"]].drop(columns=["values_imputed"]).copy()
+    tb_regions = get_region_aggregates(tb_regions, ds_regions, ds_population)
+
+    # Drop is imputed flag
+    tb = tb.drop(columns=["values_imputed"])
 
     # Format
     tb = tb.format(["country", "year"])
@@ -268,7 +269,11 @@ def add_universal_suffrage(tb: Table) -> Table:
     return tb
 
 
-def get_region_aggregates(tb: Table, ds_regions: Dataset, ds_population: Dataset) -> Table:
+def get_region_aggregates(
+    tb: Table,
+    ds_regions: Dataset,
+    ds_population: Dataset,
+) -> Table:
     """Create table with region aggregates.
 
     Includes counts of countries and counts of people living in countries"""
diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py
index d36f5846b71..7ded6380afd 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/shared.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py
@@ -391,7 +391,9 @@ def add_age_groups(
     return tb
 
 
-def add_imputes(tb: Table, path: Path, cols_verify: List[str] | None = None) -> Table:
+def add_imputes(
+    tb: Table, path: Path, cols_verify: List[str] | None = None, col_flag_imputed: str | None = None
+) -> Table:
     """Add imputed values to the table.
 
     Imputed values are inferred from historical equivalents.
@@ -406,6 +408,9 @@ def add_imputes(tb: Table, path: Path, cols_verify: List[str] | None = None) ->
     """
     tb_ = tb.copy()
 
+    if col_flag_imputed is None:
+        col_flag_imputed = "values_imputed"
+
     if cols_verify is None:
         cols_verify = ["country", "year"]
 
@@ -433,7 +438,7 @@ def add_imputes(tb: Table, path: Path, cols_verify: List[str] | None = None) ->
         #         "country": "regime_imputed_country",
         #     }
         # )
-        tb_imp_["values_imputed"] = True
+        tb_imp_[col_flag_imputed] = True
 
         # Different behaviour depending whether we have a list of countries or a single country to impute
         if isinstance(impute["country"], list):
@@ -447,7 +452,7 @@ def add_imputes(tb: Table, path: Path, cols_verify: List[str] | None = None) ->
     tb_ = concat(tb_imputed + [tb_], ignore_index=True)
 
     # Set to False by default (for non-imputed countries)
-    tb_["values_imputed"] = tb_["values_imputed"].fillna(False).astype(bool)
+    tb_[col_flag_imputed] = tb_[col_flag_imputed].fillna(False).astype(bool)
 
     # Re-order columns
     # cols = [

From a8fdc9f995b5e5baea6e508f22ae61d0429a73a3 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Mon, 13 May 2024 13:55:35 +0200
Subject: [PATCH 22/23] add note

---
 .../data/garden/democracy/2024-03-07/lexical_index.meta.yml    | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index 47401c906b0..a9a2325cefc 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -13,6 +13,9 @@ dataset:
 
 tables:
   lexical_index:
+    common:
+      description_processing: |-
+        Data for former USSR member states has been obtained by imputing the values of the USSR. This includes: Russia, Ukraine, Belarus, Latvia, Lithuania, Estonia, Armenia, Azerbaijan, Georgia, Kazakhstan, Kyrgyzstan, Tajikistan, Turkmenistan, Uzbekistan, and Moldova.
     variables:
       exelec_lied:
         title: Elections for chief executive

From f706500e118c13668479bfa8731c7f69f0891490 Mon Sep 17 00:00:00 2001
From: lucasrodes <lucasrodes@users.noreply.github.com>
Date: Mon, 13 May 2024 14:26:26 +0200
Subject: [PATCH 23/23] add data former members of czechoslovakia and
 yugoslavia

---
 .../lexical_index.countries_impute.yml        | 40 +++++++++++++++++++
 .../2024-03-07/lexical_index.meta.yml         |  4 ++
 .../democracy/2024-03-07/lexical_index.py     |  7 ++--
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml
index 202c09cb0f8..077c5a51ac9 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml
@@ -67,3 +67,43 @@
   country_impute: Russia
   year_min: 1813
   year_max: 1921
+
+# Czechoslovakia
+- country:
+    - Czechia
+    - Slovakia
+  country_impute: Czechoslovakia
+  year_min: 1918
+  year_max: 1992
+
+# Yugoslavia
+- country:
+    - Croatia
+    - Bosnia and Herzegovina
+    - Slovenia
+    - North Macedonia
+  country_impute: Yugoslavia
+  year_min: 1918
+  year_max: 1989
+- country:
+    - Serbia
+    - Montenegro
+    - Kosovo
+  country_impute: Yugoslavia
+  year_min: 1918
+  year_max: 1991
+
+# Serbia and Montenegro
+- country:
+    - Serbia
+    - Montenegro
+    - Kosovo
+  country_impute: Serbia and Montenegro
+  year_min: 1992
+  year_max: 2005
+
+# Kosovo
+- country: Kosovo
+  country_impute: Serbia
+  year_min: 2006
+  year_max: 2007
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
index a9a2325cefc..f7e68dde8fb 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml
@@ -16,6 +16,10 @@ tables:
     common:
       description_processing: |-
         Data for former USSR member states has been obtained by imputing the values of the USSR. This includes: Russia, Ukraine, Belarus, Latvia, Lithuania, Estonia, Armenia, Azerbaijan, Georgia, Kazakhstan, Kyrgyzstan, Tajikistan, Turkmenistan, Uzbekistan, and Moldova.
+
+        Data for former Czechoslovakia member states has been obtained by imputing the values of Czechoslovakia. This includes: Czechia and Slovakia.
+
+        Data for former Yugoslavia member states has been obtained by imputing the values of Yugoslavia. This includes: Slovenia, Croatia, Bosnia and Herzegovina, Serbia, Montenegro, Kosovo, and North Macedonia.
     variables:
       exelec_lied:
         title: Elections for chief executive
diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
index 2f402098ab7..fddf58c2378 100644
--- a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
+++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py
@@ -123,14 +123,15 @@ def run(dest_dir: str) -> None:
     ) == {"Germany"}, "Other versions of Germany!"
 
     # Impute values
-    tb = add_imputes(tb=tb, path=PATH_IMPUTE, col_flag_imputed="values_imputed")
+    col_flag_imputed = "values_imputed"
+    tb = add_imputes(tb=tb, path=PATH_IMPUTE, col_flag_imputed=col_flag_imputed)
 
     # Get region data
-    tb_regions = tb.loc[~tb["values_imputed"]].drop(columns=["values_imputed"]).copy()
+    tb_regions = tb.loc[~tb[col_flag_imputed]].drop(columns=[col_flag_imputed]).copy()
     tb_regions = get_region_aggregates(tb_regions, ds_regions, ds_population)
 
     # Drop is imputed flag
-    tb = tb.drop(columns=["values_imputed"])
+    tb = tb.drop(columns=[col_flag_imputed])
 
     # Format
     tb = tb.format(["country", "year"])