From 47842188babe03dfa52513878561d251015f1554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucas=20Rod=C3=A9s-Guirao?= Date: Mon, 13 May 2024 16:44:29 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=8A=20democracy:=20lexical=20index=20(?= =?UTF-8?q?#2634)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * snapshot * wip * avoid notebook creation * clean step + metadata * typo * wip * change version to use shared tools * refine implemented * cache only 1 entry * add ttl * tweaks * wip * working grapher import * minor fixes * pre 1800 for numbers * minor fixes * extend czechoslovakia, fix indicator title * change starting year for serbia, bug in years for czechia/slovakia * distribute ussr population in Asia/Europe * impute some values * tweaks * add note * add data former members of czechoslovakia and yugoslavia --- apps/wizard/pages/indicator_upgrade/utils.py | 2 +- dag/democracy.yml | 10 + .../2024-03-07/lexical_index.countries.json | 226 +++++++++ .../lexical_index.countries_impute.yml | 109 +++++ .../2024-03-07/lexical_index.meta.yml | 311 ++++++++++++ .../democracy/2024-03-07/lexical_index.py | 441 ++++++++++++++++++ .../garden/democracy/2024-03-07/shared.py | 160 ++++++- .../democracy/2024-05-09/lexical_index.py | 36 ++ .../democracy/2024-05-09/lexical_index.py | 37 ++ .../democracy/2024-05-09/lexical_index.py | 29 ++ .../2024-05-09/lexical_index.xlsx.dvc | 34 ++ 11 files changed, 1391 insertions(+), 4 deletions(-) create mode 100644 etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries.json create mode 100644 etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml create mode 100644 etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml create mode 100644 etl/steps/data/garden/democracy/2024-03-07/lexical_index.py create mode 100644 etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py create mode 100644 etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py create 
mode 100644 snapshots/democracy/2024-05-09/lexical_index.py create mode 100644 snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc diff --git a/apps/wizard/pages/indicator_upgrade/utils.py b/apps/wizard/pages/indicator_upgrade/utils.py index c1571b16d8e..179c3777f39 100644 --- a/apps/wizard/pages/indicator_upgrade/utils.py +++ b/apps/wizard/pages/indicator_upgrade/utils.py @@ -118,7 +118,7 @@ def get_schema() -> Dict[str, Any]: return schema -@st.cache_data +@st.cache_data(max_entries=1, ttl=60 * 10) def get_indicators_from_datasets( dataset_id_1: int, dataset_id_2: int, show_new_not_in_old: int = False ) -> Tuple[pd.DataFrame, pd.DataFrame]: diff --git a/dag/democracy.yml b/dag/democracy.yml index 45380d019fd..65c48f54500 100644 --- a/dag/democracy.yml +++ b/dag/democracy.yml @@ -28,3 +28,13 @@ steps: - data://garden/demography/2023-03-31/population data://grapher/democracy/2024-05-01/ert: - data://garden/democracy/2024-03-07/ert + + # Lexical Index (2023) + data://meadow/democracy/2024-05-09/lexical_index: + - snapshot://democracy/2024-05-09/lexical_index.xlsx + data://garden/democracy/2024-03-07/lexical_index: + - data://meadow/democracy/2024-05-09/lexical_index + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + data://grapher/democracy/2024-05-09/lexical_index: + - data://garden/democracy/2024-03-07/lexical_index diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries.json b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries.json new file mode 100644 index 00000000000..d76660c9b38 --- /dev/null +++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries.json @@ -0,0 +1,226 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + 
"Austria-Hungary": "Austria-Hungary", + "Azerbaijan": "Azerbaijan", + "Baden": "Grand Duchy of Baden", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Bavaria": "Kingdom of Bavaria", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Brunei": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechoslovakia": "Czechoslovakia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "East Timor": "East Timor", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Germany, East": "East Germany", + "Germany, West": "West Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": 
"Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macedonia": "North Macedonia", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia": "Micronesia (country)", + "Modena": "Duchy of Modena and Reggio", + "Moldova": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Norway": "Norway", + "Oman": "Oman", + "Orange Free State": "Orange Free State", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Parma": "Duchy of Parma and Piacenza", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Sardinia": "Kingdom of Sardinia", + "Saudi Arabia": "Saudi Arabia", + "Saxony": "Kingdom of Saxony", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sicily": 
"Kingdom of the Two Sicilies", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "Somaliland": "Somaliland", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "St. Kitts and Nevis": "Saint Kitts and Nevis", + "St. Lucia": "Saint Lucia", + "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Tuscany": "Grand Duchy of Tuscany", + "Tuvalu": "Tuvalu", + "USSR": "USSR", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "Wuerttemberg": "Kingdom of Wurttemberg", + "Yemen": "Yemen", + "Yugoslavia": "Yugoslavia", + "Zambia": "Zambia", + "Zanzibar": "Zanzibar", + "Zimbabwe": "Zimbabwe", + "Congo Brazzaville": "Congo", + "Congo, Democratic Republic": "Democratic Republic of Congo", + "Gran Colombia": "Great Colombia", + "Korea": "Korea (former)", + "Korea, North": "North Korea", + "Korea, South": "South Korea", + "Mecklenburg-Schwerin": "Mecklenburg Schwerin", + "Palestine/British Mandate": "Palestine", + "Papal states, the": "Vatican", + "Sahrawi": "Western Sahara", + "Serbia-Montenegro": "Serbia and Montenegro", + "Vietnam, North": "Democratic Republic of Vietnam", + "Vietnam, South": "Republic of 
Vietnam", + "Yemen, North": "Yemen Arab Republic", + "Yemen, South": "Yemen People's Republic" +} \ No newline at end of file diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml new file mode 100644 index 00000000000..077c5a51ac9 --- /dev/null +++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.countries_impute.yml @@ -0,0 +1,109 @@ +# List specifying how to impute values for specific countries. +# +# As an example: +# +# - country: Panama +# country_impute: Colombia +# year_min: 1832 +# year_max: 1902 +# +# This means that we want to inherit the classifications for Panama from Colombia between 1832 and 1902. +# +# We note that `country` can also be a list of countries. +# USSR +- country: + - Russia + - Ukraine + - Belarus + - Azerbaijan + - Armenia + - Georgia + country_impute: USSR + year_min: 1922 + year_max: 1990 +- country: + - Turkmenistan + - Uzbekistan + - Kazakhstan + - Tajikistan + - Kyrgyzstan + country_impute: USSR + year_min: 1922 + year_max: 1989 + +- country: + - Lithuania + - Latvia + - Estonia + - Moldova + country_impute: USSR + year_min: 1940 + year_max: 1990 + +# Russian Empire +- country: + - Belarus + - Georgia + - Turkmenistan + - Kazakhstan + country_impute: Russia + year_min: 1800 + year_max: 1921 +- country: + - Tajikistan + - Kyrgyzstan + country_impute: Russia + year_min: 1868 + year_max: 1921 +- country: Uzbekistan + country_impute: Russia + year_min: 1865 + year_max: 1911 +- country: Moldova + country_impute: Russia + year_min: 1800 + year_max: 1919 +- country: Azerbaijan + country_impute: Russia + year_min: 1813 + year_max: 1921 + +# Czechoslovakia +- country: + - Czechia + - Slovakia + country_impute: Czechoslovakia + year_min: 1918 + year_max: 1992 + +# Yugoslavia +- country: + - Croatia + - Bosnia and Herzegovina + - Slovenia + - North Macedonia + country_impute: Yugoslavia + year_min: 1918 + year_max: 1989 +- 
country: + - Serbia + - Montenegro + - Kosovo + country_impute: Yugoslavia + year_min: 1918 + year_max: 1991 + +# Serbia and Montenegro +- country: + - Serbia + - Montenegro + - Kosovo + country_impute: Serbia and Montenegro + year_min: 1992 + year_max: 2005 + +# Kosovo +- country: Kosovo + country_impute: Serbia + year_min: 2006 + year_max: 2007 diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml new file mode 100644 index 00000000000..f7e68dde8fb --- /dev/null +++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.meta.yml @@ -0,0 +1,311 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Democracy + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + +tables: + lexical_index: + common: + description_processing: |- + Data for former USSR member states has been obtained by imputing the values of the USSR. This includes: Russia, Ukraine, Belarus, Latvia, Lithuania, Estonia, Armenia, Azerbaijan, Georgia, Kazakhstan, Kyrgyzstan, Tajikistan, Turkmenistan, Uzbekistan, and Moldova. + + Data for former Czechoslovakia member states has been obtained by imputing the values of Czechoslovakia. This includes: Czechia and Slovakia. + + Data for former Yugoslavia member states has been obtained by imputing the values of Yugoslavia. This includes: Slovenia, Croatia, Bosnia and Herzegovina, Serbia, Montenegro, Kosovo, and North Macedonia. + variables: + exelec_lied: + title: Elections for chief executive + unit: '' + description_short: |- + The variable indicates whether some citizens directly or indirectly elect the chief executive. It considers political systems that do not govern themselves — such as due to external interventions, occupations, or colonization — as not holding executive elections. 
+ description_from_producer: |- + Indicates whether the chief executive is either directly or indirectly elected (i.e., chosen by people who have been elected). This indicator takes into account whether executive power is responsible to an elected parliament if the executive is not directly elected, a situation generated by a series of historical and contemporary monarchies and principalities. Episodes of international supervision or domination following international interventions, occupation, or colonization, meaning that the polity does practice exercise self-government, are also understood as disqualifying. 1=present, 0=absent. + + Equivalent indicator: `executive_elections` + + legelec_lied: + title: Legislative elections + unit: '' + description_short: |- + The variable indicates whether some citizens elect a legislature which does issue some laws, but does not perform executive functions. + description_from_producer: |- + Indicates whether a legislative body, a parliament, issues at least some laws and does not perform executive functions. The lower house (or unicameral chamber) of the legislature is at least partly elected. The legislature has not been closed. 1=present, 0=absent. + + Equivalent indicator: `legislative_elections` + + opposition_lied: + title: Political opposition + unit: '' + description_short: |- + Indicates whether more than one party or non-party candidate are able to compete in elections for the legislature. + description_from_producer: |- + Indicates whether the lower house (or unicameral chamber) of the legislature is (at least in part) elected by voters facing more than one choice. Specifically, parties are not banned and (a) more than one party, including opposition parties, are allowed to compete or (b) candidates run without party labels but represent distinct political positions. 1=present, 0=absent. 
+ + Equivalent indicator: `multiparty_legislative_election` + + competition_lied: + title: Competitive elections + unit: '' + description_short: |- + Whether the outcomes of elections are uncertain because their timing is not violated, voters are not systematically coerced, and election fraud is not consequential. + description_key: + - It considers the incumbent changing after multi-party elections as a strong indicator, but neither necessary nor sufficient. + - It does not consider whether all contestants have access to funding and the media, and media coverage is unbiased. + description_from_producer: |- + The chief executive offices and seats in the effective legislative body are filled by elections characterized by uncertainty, meaning that the elections are, in principle, sufficiently free to enable the opposition to gain power if they were to attract sufficient support from the electorate. This presumes that control over key executive and legislative offices is determined by elections, the executive and members of the legislature have not been unconstitutionally removed, and the legislature has not been dissolved. With respect to the electoral process, this presumes that the constitutional timing of elections has not been violated (in a more than marginal fashion), non- extremist parties are not banned, opposition candidates are generally free to participate, voters experience little systematic coercion in exercising their electoral choice, and electoral fraud does not determine who wins. With respect to the outcome, this presumes that the declared winner of executive and legislative elections reflects the votes cast by the electorate, as near as can be determined from extant sources. Incumbent turnover (as a result of multi-party elections) is regarded as a strong indicator of competition, but is neither necessary nor sufficient. 
In addition, we rely on reports from outside observers (as reported in books, articles, and country reports) about whether the foregoing conditions have been met in a given election. Coding for this variable does not take into account whether there is a level playing field, whether all contestants gain access to funding and media, whether media coverage is unbiased, whether civil liberties are respected, or other features associated with fully free and fair elections. 1=present, 0=absent. + + Equivalent indicator: `competitive_elections` + + poliberties_lied: + title: Political liberties + unit: '' + description_short: Indicates whether the freedoms of expression, assembly, and association are respected. + description_from_producer: |- + Freedom of expression, freedom of assembly, and freedom of association are respected. All groups, which are not openly anti-democratic, are allowed to organize freely and to assemble peacefully, and free speech, including critique of government and state-authorities, is tolerated and practiced freely by individuals and groups, including private as well as public media outlets. 1=present, 0=absent. + + Equivalent indicator: `` + + regime_redux_lied: + title: Political regime (reduced) + unit: '' + description_short: |- + Identifies the political regime of a country. It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6). + description_from_producer: |- + We operationalize electoral democracy as a series of necessary-and-sufficient conditions arrayed in an ordinal scale. The resulting Lexical Index of Electoral Democracy (LIED). 
In this fashion, we arrive at an index that performs a classificatory function, each level identifies a unique and theoretically meaningful regime type, as well as a discriminating function. To generate the lexical index from the six binary variables described above, a country-year is assigned scores (0 to 6) based on the following criteria: + + 0: legislative_election=0 & executive_elections=0 (regime type: non-electoral autocracies) + + 1: legislative_elections=1 or executive_elections=1 & multi-party_legislative_elections=0 (regime type: one-party autocracies, few cases where executive elections are on track but there is no functioning elected parliament) + + 2: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=0 (regime type: multiparty autocracies without elected executive – generally because a monarch influences government appointment and removal or foreign powers dominate political decision-making or has significant veto powers) + + 3: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=0 (regime type: multiparty autocracies) + + 4: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=0 (regime type: exclusive democracies) + + 5: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=0 (regime type: male democracies) + + 6: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=1 (regime type: electoral democracies) + + Equivalent indicator: `lexical_index` + + regime_lied: + title: Political regime + unit: '' + description_short: |- + Identifies the political regime of a country. 
It distinguishes between non-electoral autocracies (score 0), one-party autocracies (score 1), multi-party autocracies without elected executive (score 2), multi-party autocracies (score 3), exclusive democracies (score 4), male democracies (score 5), electoral democracies (score 6), and polyarchies (score 7). + description_key: + - In non-electoral autocracies, citizens do not have the right to elect the chief executive or the legislature. + - In one-party autocracies, some citizens have the right to choose the chief executive or the legislature, but only have one choice. + - Multiparty autocracies without an elected executive are otherwise one-party autocracies, but the chief executive of the government is not elected even if citizens have more than one choice in legislative elections. + - Multiparty autocracies are one-party autocracies in which citizens have more than one choice, though election outcome is certain. + - In exclusive democracies, citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections, but suffrage is restricted. + - Male democracies are exclusive democracies that have comprehensive suffrage for men. + - Electoral democracies are male democracies that also have comprehensive suffrage for women. + - Polyarchies are electoral democracies that also protect the freedoms of expression, assembly, and association. + description_from_producer: |- + This index, LIED+, add an extra layer to the upper-end of LIED in the form of political liberties. This is done to distinguish between electoral democracies and polyarchies. 
The meaning of the scores from 0 to 5 are identical to LIED, whereas 6 and 7 refer to the following configurations of indicator values: + + 6: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=1 & political_liberties=0 (regime type: electoral democracies) + + 7: legislative_elections=1 & multi-party_legislative_elections=1 & executive_elections=1 & competitive_elections=1 & male_suffrage=1 & female_suffrage=1 & political_liberties=1 (regime type: polyarchies) + + Equivalent indicator: `lexical_index_plus` + description_processing: |- + 0: non-electoral autocracy + 1: one-party autocracy + 2: multi-party autocracy without elected executive + 3: multi-party autocracy + 4: exclusive democracy + 5: male democracy + 6: electoral democracy + 7: polyarchy + + male_suffrage_lied: + title: Universal right to vote for men + unit: '' + description_short: |- + Indicates whether all men are allowed to vote in national elections. It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency. + description_from_producer: |- + Indicates whether virtually all male citizens are allowed to vote in national elections. Legal restrictions pertaining to age, criminal conviction, incompetence, and local residency are not considered. Informal restrictions such as those obtaining in the American South prior to 1965 are also not considered. 1=present, 0=absent. + + Equivalent indicator: `male_suffrage` + + female_suffrage_lied: + title: Universal right to vote for women + unit: '' + description_short: |- + Indicates whether all women are allowed to vote in national elections. It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency. 
+ description_from_producer: |- + Indicates whether virtually all female citizens are allowed to vote in national elections. Similar coding rules apply. 1=present, 0=absent. + + Equivalent indicator: `female_suffrage` + + suffrage_lied: + title: Universal suffrage + unit: '' + description_short: |- + Indicates whether virtually all men and women that are citizens are allowed to vote in national elections (score 2), whether it is only men (score 1), or there is no universal rights to vote for either men or women (score 0). It neither considers informal restrictions nor legal restrictions based on age, criminal conviction, disability, and local residency. + description_processing: |- + It combines the indicators `male_suffrage` and `female_suffrage` in Skaaning et al. (2015). + + democracy_lied: + title: "Electoral democracy" + unit: '' + description_short: |- + The variable identifies the political regime of a country using the reduced Lexical Index of Electoral Democracy. It distinguishes between non-electoral autocracies, one-party autocracies, multi-party autocracies without elected executive, multi-party autocracies, exclusive democracies, male democracies, and electoral democracies (including polyarchies). + description_key: + - In non-electoral autocracies, citizens do not have the right to elect the chief executive or the legislature. + - In one-party autocracies, some citizens have the right to choose the chief executive or the legislature, but only have one choice. + - Multiparty autocracies without an elected executive are otherwise one-party autocracies, but the chief executive of the government is not elected even if citizens have more than one choice in legislative elections. + - Multiparty autocracies are one-party autocracies in which citizens have more than one choice, though election outcome is certain. 
+ - In exclusive democracies, citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections, but suffrage is restricted. + - Male democracies are exclusive democracies that have comprehensive suffrage for men. + - Electoral democracies are male democracies that also have comprehensive suffrage for women. + + age_electdem_lied: + title: 'Age of electoral democracy' + unit: '' + description_short: |- + Number of consecutive years in electoral democracy. + description_key: &key_electdem + - Electoral democracies are understood here as political systems in which citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections. + + group_age_electdem_lied: + title: 'Age of electoral democracy (category)' + unit: '' + description_short: |- + The variable distinguishes between non-electoral autocracies, one-party autocracies, multi-party autocracies without elected executive, multi-party autocracies, exclusive democracies, male democracies, electoral democracies aged 1-18 years, 19-30 years, 31-60 years, 61-90 years, and 91+ years in electoral democracy. + description_key: *key_electdem + + experience_electdem_lied: + title: 'Experience with electoral democracy' + unit: '' + description_short: |- + Number of total years in electoral democracy. It sums all periods of electoral democracy. + description_key: *key_electdem + + age_polyarchy_lied: + title: 'Age of polyarchy' + unit: '' + description_short: Number of consecutive years in a polyarchy. + description_key: &key_poly + - In polyarchies, citizens have the right to choose the chief executive and the legislature in multi-party, uncertain elections, and enjoy freedoms of expression, assembly, and association. 
+ + group_age_polyarchy_lied: + title: 'Age of polyarchy (category)' + unit: '' + description_short: |- + The variable distinguishes between non-electoral autocracies, one-party autocracies, multi-party autocracies without elected executive, multi-party autocracies, exclusive democracies, male democracies, electoral democracies, polyarchies aged 1-18 years, 19-30 years, 31-60 years, 61-90 years, and 91+ years in a polyarchy. + description_key: *key_poly + + experience_polyarchy_lied: + title: 'Experience with polyarchy' + unit: '' + description_short: Number of total years in a polyarchy. It sums all periods of polyarchy. + description_key: *key_poly + + region_aggregates: + variables: + num_democracy_lied: + title: |- + <% if category == '-1' %> + Number of countries with unknown regime (democracy/autocracy) + <% else %> + Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> + <% endif %> + # description_short: + unit: "countries" + num_group_age_electdem_lied: + title: |- + <% if 'years' in category %> + Number of electoral democracies aged << category >> + <% elif category == '-1' %> + Number of countries with unknown regime (age groups electoral) + <% else %> + Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups electoral) + <% endif %> + unit: "countries" + num_group_age_polyarchy_lied: + title: |- + <% if 'years' in category %> + Number of polyarchies aged << category >> + <% elif category == '-1' %> + Number of countries with unknown regime (age groups polyarchy) + <% else %> + Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups polyarchy) + <% endif %> + # description_short: + unit: "countries" + num_regime_lied: + title: |- + <% if category == '-1' %> + Number of countries with unknown regime + <% else %> + Number of << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 
'archies') >> + <% endif %> + # description_short: + unit: "countries" + num_suffrage_lied: + title: |- + <% if category == '-1' %> + Number of countries with unknown suffrage + <% else %> + Number of countries with << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> + <% endif %> + # description_short: + unit: "countries" + + pop_democracy_lied: + title: |- + <% if category == '-1' %> + People living in countries with unknown regime (autocracy/democracy) + <% else %> + People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> + <% endif %> + # description_short: + unit: "people" + pop_group_age_electdem_lied: + title: |- + <% if 'years' in category %> + People living in electoral democracies aged << category >> + <% else %> + People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups electoral) + <% endif %> + # description_short: + unit: "people" + pop_group_age_polyarchy_lied: + title: |- + <% if 'years' in category %> + People living in polyarchies aged << category >> + <% else %> + People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> (age groups polyarchy) + <% endif %> + # description_short: + unit: "people" + pop_regime_lied: + title: |- + <% if category == '-1' %> + People living in countries with unknown regime + <% else %> + People living in << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> + <% endif %> + # description_short: + unit: "people" + pop_suffrage_lied: + title: |- + <% if category == '-1' %> + People living in countries with unknown suffrage + <% else %> + People living in countries with << category.replace('_', ' ').replace('cracy', 'cracies').replace('archy', 'archies') >> + <% endif %> + # description_short: + unit: "people" diff --git a/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py 
b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py new file mode 100644 index 00000000000..fddf58c2378 --- /dev/null +++ b/etl/steps/data/garden/democracy/2024-03-07/lexical_index.py @@ -0,0 +1,441 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import ast +from typing import cast + +from owid.catalog import Dataset, Table +from owid.catalog.processing import concat +from shared import ( + add_age_groups, + add_count_years_in_regime, + add_imputes, + add_population_in_dummies, + add_regions_and_global_aggregates, + expand_observations, + from_wide_to_long, + make_table_with_dummies, +) + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +PATH_IMPUTE = paths.directory / "lexical_index.countries_impute.yml" + +REGIME_LABELS = { + 0: "non-electoral autocracy", + 1: "one-party autocracy", + 2: "multi-party autocracy without elected executive", + 3: "multi-party autocracy", + 4: "exclusive democracy", + 5: "male democracy", + 6: "electoral democracy", +} + +REGIONS = { + "Africa": { + "additional_members": [ + "Cape Colony", + "Natal", + "Orange Free State", + "Transvaal", + "Somaliland", + "Zanzibar", + ] + }, + "Asia": { + "additional_members": [ + "Palestine/Gaza", + "Palestine/West Bank", + "Republic of Vietnam", + "Democratic Republic of Vietnam", + "Ottoman Empire", + "Tibet", + ] + }, + "North America": { + "additional_members": [ + "United Provinces of Central America", + ] + }, + "South America": { + "additional_members": [ + "Great Colombia", + ] + }, + "Europe": { + "additional_members": [ + "Brunswick", + "Hamburg", + "Hesse-Darmstadt", + "Hesse-Kassel", + "Nassau", + "Oldenburg", + "Papal States", + "Prussia", + "Kingdom of Sardinia", + "Saxe-Weimar-Eisenach", + "Kingdom of the Two Sicilies", + ] + }, + "Oceania": {}, +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. 
+ ds_meadow = paths.load_dataset("lexical_index") + ds_regions = paths.load_dataset("regions") + ds_population = paths.load_dataset("population") + + # Read table from meadow dataset. + tb = ds_meadow["lexical_index"].reset_index() + + # + # Process data. + # + # Initial cleaning + tb = preprocess(tb) + + # Create variable distinguishing between democracies and autocracies: + tb = add_is_democracy(tb) + + # Create indicators with ages and experiences (electoral democracy and polyarchy) + tb = add_age_and_experience(tb) + + # Create variable for universal suffrage + tb = add_universal_suffrage(tb) + + # Dtypes + tb["age_electdem_lied"] = tb["age_electdem_lied"].astype("string") + tb["age_polyarchy_lied"] = tb["age_polyarchy_lied"].astype("string") + + # Checks on countries + assert set( + tb.loc[tb["country"].str.contains("Germany") & (tb["year"] < 1990) & (tb["year"] > 1944), "country"] + ) == {"East Germany", "West Germany"}, "Other versions of Germany!" + assert set( + tb.loc[tb["country"].str.contains("Germany") & ((tb["year"] >= 1990) | (tb["year"] <= 1944)), "country"] + ) == {"Germany"}, "Other versions of Germany!" + + # Impute values + col_flag_imputed = "values_imputed" + tb = add_imputes(tb=tb, path=PATH_IMPUTE, col_flag_imputed=col_flag_imputed) + + # Get region data + tb_regions = tb.loc[~tb[col_flag_imputed]].drop(columns=[col_flag_imputed]).copy() + tb_regions = get_region_aggregates(tb_regions, ds_regions, ds_population) + + # Drop is imputed flag + tb = tb.drop(columns=[col_flag_imputed]) + + # Format + tb = tb.format(["country", "year"]) + tb_regions = tb_regions.format(["country", "year", "category"], short_name="region_aggregates") + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + tables = [ + tb, + tb_regions, + ] + ds_garden = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_garden.save() + + +def preprocess(tb: Table) -> Table: + """Pre-process data. + + Includes: removing NaNs, fixing bugs, sanity checks, renaming and selecting relevant columns. + """ + ## Harmonize country names + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + # Rename columns of interest + tb = rename_columns(tb) + + # HOTFIX 2 -> 1 encoding + countries_miss_encoded = set(tb.loc[(tb["opposition_lied"] == 2) | (tb["legelec_lied"] == 2), "country"]) + assert countries_miss_encoded == {"Botswana"} + tb.loc[tb["opposition_lied"] == 2, "opposition_lied"] = 1 + tb.loc[tb["legelec_lied"] == 2, "legelec_lied"] = 1 + + # HOTFIX: if regime_lied is 7, then regime_redux_lied should be 6 + # There is an error in Argentina@2022 + tb.loc[(tb["regime_lied"] == 7), "regime_redux_lied"] = 6 + + # Select relevant columns + tb = tb.loc[ + :, + [ + "country", + "year", + "regime_lied", + "regime_redux_lied", + "exelec_lied", + "legelec_lied", + "opposition_lied", + "competition_lied", + "male_suffrage_lied", + "female_suffrage_lied", + "poliberties_lied", + ], + ] + + return tb + + +def rename_columns(tb: Table) -> Table: + """Rename variables of interest.""" + tb = tb.rename( + columns={ + "executive_elections": "exelec_lied", + "legislative_elections": "legelec_lied", + "multi_party_legislative_elections": "opposition_lied", + "competitive_elections": "competition_lied", + "political_liberties": "poliberties_lied", + "lexical_index": "regime_redux_lied", + "lexical_index_plus": "regime_lied", + "male_suffrage": "male_suffrage_lied", + "female_suffrage": "female_suffrage_lied", + } + ) + return tb + + +def add_is_democracy(tb: Table) -> Table: + """Create variable distinguishing between democracies and autocracies.""" + tb.loc[tb["regime_redux_lied"] == 6, "democracy_lied"] = 1 + tb.loc[(tb["regime_redux_lied"] >= 0) & (tb["regime_redux_lied"] < 6), "democracy_lied"] = 0 + tb["democracy_lied"] = tb["democracy_lied"].astype(int) + 
tb["democracy_lied"].metadata = tb["regime_redux_lied"].metadata
+    return tb
+
+
+def add_age_and_experience(tb: Table) -> Table:
+    """Add age and experience related indicators.
+
+    This includes:
+    - Number of consecutive years in electoral democracy and polyarchy (age)
+    - Number of total years in electoral democracy and polyarchy (experience)
+    - Age groups for electoral democracy and polyarchy
+    """
+    columns = [
+        ("regime_lied", "electdem_lied", 5),
+        ("regime_lied", "polyarchy_lied", 6),
+    ]
+    # Add age and experience counts
+    tb = add_count_years_in_regime(
+        tb=tb,
+        columns=columns,
+    )
+
+    for col in columns:
+        col_age = f"age_{col[1]}"
+        # Add age groups
+        tb = add_age_groups(tb=tb, column=col_age, column_raw=col[0], category_names=REGIME_LABELS, threshold=col[2])
+
+        # Replace category numbers with labels (age in *)
+        mapping = {num: label for num, label in REGIME_LABELS.items() if num <= col[2]}
+        mask = (tb[col_age] == 0) | (tb[col_age].isna())
+        tb.loc[mask, col_age] = tb.loc[mask, col[0]].replace(mapping)
+
+    return tb
+
+
+def add_universal_suffrage(tb: Table) -> Table:
+    """Add `suffrage_lied`: 0 = no suffrage, 1 = male only, 1.5 = female only (asserted absent), 2 = universal."""
+    tb.loc[(tb["male_suffrage_lied"] == 0) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 0
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 0), "suffrage_lied"] = 1
+    tb.loc[(tb["male_suffrage_lied"] == 0) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 1.5  # sentinel for women-only suffrage, checked below
+    tb.loc[(tb["male_suffrage_lied"] == 1) & (tb["female_suffrage_lied"] == 1), "suffrage_lied"] = 2
+    tb["suffrage_lied"].metadata = tb["female_suffrage_lied"].metadata
+
+    assert (
+        (tb["suffrage_lied"] == 1.5).sum() == 0
+    ), "There are countries with women suffrage but not men suffrage! This is not expected and can lead to confusing visualisations."
+
+    return tb
+
+
+def get_region_aggregates(
+    tb: Table,
+    ds_regions: Dataset,
+    ds_population: Dataset,
+) -> Table:
+    """Create table with region aggregates.
+ + Includes counts of countries and counts of people living in countries""" + tb_ = tb.copy() + + # Set INTs + tb_ = tb_.astype( + { + "democracy_lied": "Int64", + "regime_lied": "Int64", + } + ) + tb_ = cast(Table, tb_) + + # Define columns on which we will estimate (i) "number of countries" and (ii) "number of people living in ..." + indicators = [ + { + "name": "democracy_lied", + "values_expected": {"0": "autocracy", "1": "democracy"}, + "has_na": False, + }, + { + "name": "regime_lied", + "values_expected": { + "0": "non-electoral autocracy", + "1": "one-party autocracy", + "2": "multi-party autocracy without elected executive", + "3": "multi-party autocracy", + "4": "exclusive democracy", + "5": "male democracy", + "6": "electoral democracy", + "7": "polyarchy", + }, + "has_na": False, + }, + { + "name": "suffrage_lied", + "values_expected": { + "0.0": "no suffrage", + "1.0": "male suffrage", + "2.0": "universal suffrage", + }, + "has_na": False, + }, + ] + for col in ["group_age_electdem_lied", "group_age_polyarchy_lied"]: + indicators.append( + { + "name": col, + "values_expected": {v: v for v in set(tb_[col].fillna("-1"))}, + "has_na": False, + } + ) + + indicator_names = [indicator["name"] for indicator in indicators] + + # 1) numbers + ## Make dummies + tb_num = make_table_with_dummies(tb_, indicators) + + ## Count + tb_num = add_regions_and_global_aggregates(tb_num, ds_regions, regions=REGIONS) + tb_num = from_wide_to_long(tb_num) + tb_num = tb_num.rename(columns=dict(zip(indicator_names, [f"num_{i}" for i in indicator_names]))) + + # 2) Get people + ## Get missing years (not to miss anyone!) -- Note that this can lead to country overlaps (e.g. 
USSR and Latvia) + tb_pop = expand_observations_without_duplicates(tb_) + print(f"{tb_.shape} -> {tb_pop.shape}") + + ## Make dummies + for ind in indicators: + ind["has_na"] = True + tb_pop = make_table_with_dummies(tb_pop, indicators) + + # Replace USSR -> current states + tb_pop = replace_ussr(tb_pop, ds_regions) + + ## Counts + tb_pop = add_population_in_dummies(tb_pop, ds_population) + tb_pop = add_regions_and_global_aggregates(tb_pop, ds_regions, regions=REGIONS) + tb_pop = from_wide_to_long(tb_pop) + tb_pop = tb_pop.rename(columns=dict(zip(indicator_names, [f"pop_{i}" for i in indicator_names]))) + tb_pop = tb_pop[tb_pop["year"] >= 1800] + + # 3) Merge + tb_regions = tb_num.merge(tb_pop, on=["country", "year", "category"], how="outer") + # assert (tb_num.shape == tb_pop.shape) and (len(tb_num) == len(tb_regions)) + # tb_regions.loc[tb_regions["category"] == "-1", ["num_regime_ert", "num_regime_trich_ert"]] = float("nan") + + return tb_regions + + +def expand_observations_without_duplicates(tb: Table) -> Table: + tb_exp = expand_observations(tb) + tb_exp = tb_exp.loc[ + ~( + # YUGOSLAVIA + ((tb_exp["country"] == "Yugoslavia") & ((tb_exp["year"] > 1990) | (tb_exp["year"] < 1918))) + | ((tb_exp["country"] == "Slovenia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990))) + | ((tb_exp["country"] == "North Macedonia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990))) + | ((tb_exp["country"] == "Croatia") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990))) + | ((tb_exp["country"] == "Bosnia and Herzegovina") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 1990))) + | ((tb_exp["country"] == "Serbia and Montenegro") & ((tb_exp["year"] > 2005) | (tb_exp["year"] <= 1990))) + | ((tb_exp["country"] == "Serbia") & ((tb_exp["year"] > 1917) & (tb_exp["year"] <= 2005))) + | ((tb_exp["country"] == "Montenegro") & ((tb_exp["year"] > 1914) & (tb_exp["year"] <= 2005))) + | ((tb_exp["country"] == "Kosovo") & ((tb_exp["year"] >= 1918) & (tb_exp["year"] <= 
2007))) + # YEMEN + | ((tb_exp["country"] == "Yemen Arab Republic") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1940))) + | ((tb_exp["country"] == "Yemen People's Republic") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1940))) + | ((tb_exp["country"] == "Yemen") & ((tb_exp["year"] >= 1940) & (tb_exp["year"] <= 1989))) + # GERMANY + | ((tb_exp["country"] == "West Germany") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1945))) + | ((tb_exp["country"] == "East Germany") & ((tb_exp["year"] > 1989) | (tb_exp["year"] < 1945))) + | ((tb_exp["country"] == "Germany") & (tb_exp["year"] >= 1945) & (tb_exp["year"] <= 1989)) + # USSR + | ((tb_exp["country"] == "USSR") & ((tb_exp["year"] > 1990) | (tb_exp["year"] < 1941))) + | ((tb_exp["country"] == "Uzbekistan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Kazakhstan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Turkmenistan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Kyrgyzstan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Tajikistan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Russia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Ukraine") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Belarus") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Moldova") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Latvia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Lithuania") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Estonia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Armenia") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Georgia") & 
(tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + | ((tb_exp["country"] == "Azerbaijan") & (tb_exp["year"] >= 1941) & (tb_exp["year"] <= 1990)) + # CZECHOSLOVAKIA + | ((tb_exp["country"] == "Czechoslovakia") & ((tb_exp["year"] > 1992) | (tb_exp["year"] < 1918))) + | ((tb_exp["country"] == "Czechia") & ((tb_exp["year"] <= 1992) & (tb_exp["year"] >= 1918))) + | ((tb_exp["country"] == "Slovakia") & ((tb_exp["year"] <= 1992) & (tb_exp["year"] >= 1918))) + ), + ] + + return tb_exp + + +def replace_ussr(tb: Table, ds_regions: Dataset) -> Table: + tb_regions = ds_regions["regions"] + codes = tb_regions.loc["OWID_USS", "successors"] + successors = set(tb_regions.loc[ast.literal_eval(codes), "name"]) + + # Create new rows + tb_succ = [] + for successor in successors: + # Copy USSR data + tb_ = tb.loc[(tb["country"] == "USSR")].copy() + # Replace country name + tb_["country"] = successor + # Append + tb_succ.append(tb_) + tb_succ = concat(tb_succ, ignore_index=True) + + # Concatenate tables + tb = concat([tb, tb_succ], ignore_index=True).sort_values(["country", "year"]) + + # Remove USSR + tb = tb.loc[~(tb["country"] == "USSR")] + return tb diff --git a/etl/steps/data/garden/democracy/2024-03-07/shared.py b/etl/steps/data/garden/democracy/2024-03-07/shared.py index f6bd6a3aba1..7ded6380afd 100644 --- a/etl/steps/data/garden/democracy/2024-03-07/shared.py +++ b/etl/steps/data/garden/democracy/2024-03-07/shared.py @@ -1,7 +1,9 @@ -from typing import Any, Callable, Dict, List, Optional, cast +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, cast import numpy as np import pandas as pd +import yaml from owid.catalog import Dataset, Table from owid.catalog.tables import concat @@ -227,7 +229,7 @@ def make_table_with_dummies( # Check and fix NA (convert NAs to -1 category) if indicator["has_na"]: # Assert that there are actually NaNs - assert tb_[indicator["name"]].isna().any(), "No NA found!" 
+            assert tb_[indicator["name"]].isna().any(), f"No NA found in {indicator['name']}!"
             # If NA, we should not have category '-1', otherwise these would get merged!
             assert "-1" not in set(
                 tb_[indicator["name"]].unique()
@@ -239,7 +241,7 @@ def make_table_with_dummies(
             else:
                 values_expected |= {"-1"}
         else:
-            assert not tb_[indicator["name"]].isna().any(), "NA found!"
+            assert not tb_[indicator["name"]].isna().any(), f"NA found in {indicator['name']}!"
 
         values_found = set(tb_[indicator["name"]].unique())
         assert values_found == set(
@@ -314,3 +316,155 @@ def add_regions_and_global_aggregates(
     tb = concat([tb_regions, tb_world], ignore_index=True, short_name="region_counts")
 
     return tb
+
+
+def add_count_years_in_regime(
+    tb: Table,
+    columns: List[Tuple[str, str, int]],
+) -> Table:
+    """Add years in a certain regime, for each `(column, new column suffix, threshold)` tuple in `columns`.
+
+    Two types of counters are generated:
+    - Age: Number of years consecutively with a certain regime type.
+    - Experience: Number of years with a certain regime type.
+    """
+
+    def _count_years_in_regime(tb, col, col_new, th):
+        col_th = "thresholded"
+
+        tb[col_th] = pd.cut(tb[col], bins=[-float("inf"), th, float("inf")], labels=[0, 1]).astype(int)
+        # Add age: running count of consecutive years above `th` (a new group, hence a new count, starts at every year at or below `th`)
+        tb[f"age_{col_new}"] = tb.groupby(["country", tb[col_th].fillna(0).eq(0).cumsum()])[col_th].cumsum().astype(int)
+        tb[f"age_{col_new}"] = tb[f"age_{col_new}"].copy_metadata(tb[col])
+        # Add experience: running count of all years above `th` per country (never reset)
+        tb[f"experience_{col_new}"] = tb.groupby("country")[col_th].cumsum().astype(int)
+        tb[f"experience_{col_new}"] = tb[f"experience_{col_new}"].copy_metadata(tb[col])
+        # Sanity check
+        assert (tb.loc[tb[col_th] == 1, f"age_{col_new}"] != 0).all(), "Negative age found!"
+        assert (tb.loc[tb[col_th] == 1, f"experience_{col_new}"] != 0).all(), "Negative age found!"
+        # Drop unused columns
+        tb = tb.drop(columns=[col_th])
+        return tb
+
+    if columns:
+        for col in columns:
+            assert len(col) == 3, "Columns should be a list of tuples with 3 elements: (colname, col_newname, col_th)"
+            tb = _count_years_in_regime(tb, *col)
+    return tb
+
+
+def add_age_groups(
+    tb: Table,
+    column: str,
+    column_raw: str,
+    threshold: int,
+    category_names: Dict[Any, str],
+    age_bins: List[int | float] | None = None,
+) -> Table:
+    """Add `group_<column>`: age-group labels for `column`, with `category_names` labels (looked up via `column_raw`) where no age group applies."""
+    column_new = f"group_{column}"
+
+    if age_bins is None:
+        age_bins = [0, 18, 30, 60, 90, float("inf")]
+
+    # Create age group labels ("1-18 years", "19-30 years", ...; the open-ended last bin becomes "91+ years")
+    assert len(age_bins) > 1, "There should be at least two age groups."
+    labels = []
+    for i in range(len(age_bins) - 1):
+        labels.append(f"{age_bins[i]+1}-{age_bins[i+1]} years".replace("-inf", "+"))
+
+    # Create variable for age group (values of `column` outside the bins stay missing)
+    tb[column_new] = pd.cut(
+        tb[column],
+        bins=age_bins,
+        labels=labels,
+    ).astype("string")
+
+    # Add additional categories: rows without an age group get the name of their raw category (ids up to `threshold`)
+    for regime_id, regime_name in category_names.items():
+        if regime_id > threshold:
+            break  # NOTE(review): stops at the first id above `threshold`, so this presumably relies on `category_names` being in ascending order
+        tb.loc[(tb[column_raw] == regime_id) & tb[column_new].isna(), column_new] = regime_name
+
+    # Copy metadata
+    tb[column_new] = tb[column_new].copy_metadata(tb[column])
+    return tb
+
+
+def add_imputes(
+    tb: Table, path: Path, cols_verify: List[str] | None = None, col_flag_imputed: str | None = None
+) -> Table:
+    """Add imputed values to the table.
+
+    Imputed values are inferred from historical equivalents.
+
+    Example: Was "Eritrea" a democracy in 1993?
+
+    - We can infer this from "Ethiopia (former)" (historical equivalent). You can see all these mappings in the YAML file given by `path` (e.g. lexical_index.countries_impute.yml).
+
+    - This is useful to (i) be able to colour these world regions in grapher map charts, and (ii) to be able to count the number of people living in democracy (in `make_tables_population_counters`).
+
+    - Note that these "imputed country values" are ignored when estimating the number of countries in democracies (function `make_tables_country_counters`), since these countries did not exist at the time!
+    """
+    tb_ = tb.copy()
+
+    if col_flag_imputed is None:
+        col_flag_imputed = "values_imputed"
+
+    if cols_verify is None:
+        cols_verify = ["country", "year"]
+
+    # Load impute data
+    countries_impute = yaml.safe_load(path.read_text())
+
+    # Drop known values that are not correct -- NOTE(review): no code follows this comment
+
+    tb_imputed = []
+    for impute in countries_impute:
+        # Get relevant rows (source country `country_impute`, years within [year_min, year_max])
+        tb_imp_ = tb_.loc[
+            (tb_["country"] == impute["country_impute"])
+            & (tb_["year"] >= impute.get("year_min", 99999))  # NOTE(review): this default selects no rows when `year_min` is missing
+            & (tb_["year"] <= impute.get("year_max", -99999))  # NOTE(review): same for `year_max`; both keys are read directly by the asserts below
+        ].copy()
+        # Sanity checks
+        assert tb_imp_.shape[0] > 0, f"No data found for {impute['country_impute']}"
+        assert tb_imp_["year"].max() == impute["year_max"], f"Missing years (max check) for {impute['country_impute']}"
+        assert tb_imp_["year"].min() == impute["year_min"], f"Missing years (min check) for {impute['country_impute']}"
+
+        # Tweak them
+        # tb_ = tb_.rename(
+        #     columns={
+        #         "country": "regime_imputed_country",
+        #     }
+        # )
+        tb_imp_[col_flag_imputed] = True
+
+        # Different behaviour depending whether we have a list of countries or a single country to impute
+        if isinstance(impute["country"], list):
+            for country in impute["country"]:
+                tb_imp_["country"] = country
+                tb_imputed.append(tb_imp_.copy())
+        else:
+            tb_imp_["country"] = impute["country"]
+            tb_imputed.append(tb_imp_)
+
+    tb_ = concat(tb_imputed + [tb_], ignore_index=True)
+
+    # Set to False by default (for non-imputed countries)
+    tb_[col_flag_imputed] = tb_[col_flag_imputed].fillna(False).astype(bool)
+
+    # Re-order columns
+    # cols = [
+    #     "country",
+    #     "year",
+    #     "regime",
+    #     "regime_womsuffr",
+    #     "regime_imputed_country",
+    #     "regime_imputed",
+    # ]
+    # tb_ = cast(Table, tb_[cols])
+
+    # Verify that there are no duplicates
+    tb_ = tb_.set_index(cols_verify,
verify_integrity=True).sort_index().reset_index() + return tb_ diff --git a/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py b/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py new file mode 100644 index 00000000000..13227e5d023 --- /dev/null +++ b/etl/steps/data/grapher/democracy/2024-05-09/lexical_index.py @@ -0,0 +1,36 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("lexical_index") + + # Read table from garden dataset. + tb = ds_garden["lexical_index"] + tb_regions = ds_garden["region_aggregates"] + + # + # Process data. + # + tables = [ + tb, + tb_regions, + ] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py b/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py new file mode 100644 index 00000000000..cb047dee3e2 --- /dev/null +++ b/etl/steps/data/meadow/democracy/2024-05-09/lexical_index.py @@ -0,0 +1,37 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("lexical_index.xlsx") + + # Load data from snapshot. + tb = snap.read() + + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. 
+    tb = tb.rename(columns={"countryn": "country"})
+    tb = tb.format(["country", "year"])
+
+    # Dtype: blank out the stray "," values so that the column can be cast to a nullable integer
+    tb.loc[tb["lexical_index"] == ",", "lexical_index"] = float("nan")
+    tb["lexical_index"] = tb["lexical_index"].astype("Int64")
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/democracy/2024-05-09/lexical_index.py b/snapshots/democracy/2024-05-09/lexical_index.py
new file mode 100644
index 00000000000..cf335448a64
--- /dev/null
+++ b/snapshots/democracy/2024-05-09/lexical_index.py
@@ -0,0 +1,29 @@
+"""Script to create a snapshot of dataset.
+
+To download, visit https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/WPKNIT&version=3.0, and download LIED_6.5.xlsx file.
+
+NOTE: in case this site is unavailable, please look for an alternative on the provider's main site: https://ps.au.dk/en/research/research-projects/dedere/datasets (also listed in the metadata)"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"democracy/{SNAPSHOT_VERSION}/lexical_index.xlsx")
+
+    # Create the snapshot from the local file given by --path-to-file, add it to DVC and upload to S3.
+    snap.create_snapshot(filename=path_to_file, upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc b/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
new file mode 100644
index 00000000000..171919f4bfc
--- /dev/null
+++ b/snapshots/democracy/2024-05-09/lexical_index.xlsx.dvc
@@ -0,0 +1,34 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Lexical Index of Electoral Democracy (LIED)
+    description: |-
+      LIED is the most comprehensive dataset on democracy in terms of country-years. It covers all
+      independent countries and most semi-sovereign polities and overseas colonies, protectorates, etc. within the 1789 to 2020 timespan. Scores have also been assigned to the units in the case of short-term foreign occupation. Scores for each indicator reflect the status of a country on the last day of the calendar year (31 December) and are not intended to reflect the mean value of an indicator across the previous 364 days. Coding decisions are based on country-specific sources.
+
+      All original coding has been done by Svend-Erik Skaaning. Svend-Erik Skaaning has developed the conceptual distinctions and cumulative logic associated with the lexical index in collaboration with John Gerring. The distinctions regarding modes of democratic transition and breakdown have been developed by Svend-Erik Skaaning, who has also developed the turnover variables. Henrikas Bartusevicius was in charge of empirical analyses and the coding linked to the inter-coder reliability test presented in the dataset paper (see below).
+
+      The dataset consists of 14 original indicators and two original indices. The LIED dataset offers indicators on whether legislative elections are on track (legislative_elections), whether (direct or indirect) executive elections are on track (executive_elections), whether multiple parties are able to run for legislative elections (multi-party_legislative_elections), whether there is universal male suffrage (male_suffrage), and whether there is universal female suffrage (female_suffrage), whether elections are genuinely contested (competitive_elections), whether political liberties in the form of freedom of expression, assembly, and association, are respected (political_liberties), whether countries experienced democratic transition in a given year (democratic_transition), the mode of democratic transition (transition_type), whether countries experienced democratic breakdown in a given year (democratic_breakdown), the mode of democratic breakdown (breakdown_type), whether elections led to a government turnover (turnover_event), and whether a period of competitive elections has been characterized by at least one government turnover (turnover_period). Finally, the data are used to construct two indices, i.e., the Lexical Index of Electoral Democracy (lexical_index) and an extended version called Lexical Index of Electoral Democracy+ (lexical_index_plus).
+    date_published: "2023-07-31"
+    version_producer: v6.5
+
+    # Citation
+    producer: Skaaning et al.
+    citation_full: |-
+      Skaaning, Svend-Erik, John Gerring and Henrikas Bartusevičius. 2015. A Lexical Index of Electoral Democracy. Comparative Political Studies 48(12):1491-1525.
+
+    # Files
+    url_main: https://ps.au.dk/en/research/research-projects/dedere/datasets
+    date_accessed: 2024-05-09
+
+    # License
+    license:
+      name: CC0
+      url: https://creativecommons.org/publicdomain/zero/1.0/
+
+outs:
+  - md5: 3956d28a242f0e9e4f3e789bfdf605fc
+    size: 2293527
+    path: lexical_index.xlsx