From c51cdd748149e8038468c2827377b5607bdc6ea1 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 9 Apr 2024 17:00:43 +0200 Subject: [PATCH 01/61] Let ETL write execution times to hidden file, and print informative messages (#2503) * Let ETL write execution times to hidden file, and print informative messages * Show estimated time also when running etl in dry run mode --- .gitignore | 2 + etl/command.py | 118 +++++++++++++++++++++++++++++++++++++++++-------- etl/paths.py | 3 ++ 3 files changed, 105 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 0e29c71e468..0b877817eda 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ site/ .wizardcfg/* .streamlit/* .ipynb_lock +.execution_time.json + diff --git a/etl/command.py b/etl/command.py index 62a8f005b17..efc234a57cd 100644 --- a/etl/command.py +++ b/etl/command.py @@ -5,13 +5,17 @@ import difflib import itertools +import json import re import resource import sys import time +from collections.abc import MutableMapping from concurrent.futures import FIRST_COMPLETED, Future, ProcessPoolExecutor, ThreadPoolExecutor, wait from contextlib import contextmanager +from functools import partial from graphlib import TopologicalSorter +from multiprocessing import Manager from os import environ from pathlib import Path from typing import Any, Callable, Dict, Iterator, List, Optional, Set @@ -343,11 +347,18 @@ def run_dag( print("--- All datasets up to date!") return + # Calculate total expected time for all steps (if run sequentially) + total_expected_time_seconds = sum(_get_execution_time(str(step)) or 0 for step in steps) + if dry_run: - print(f"--- Running {len(steps)} steps:") + print( + f"--- Would run {len(steps)} steps{_create_expected_time_message(total_expected_time_seconds, prepend_message=' (at least ')}:" + ) return enumerate_steps(steps) elif workers == 1: - print(f"--- Running {len(steps)} steps:") + print( + f"--- Running {len(steps)} steps{_create_expected_time_message(total_expected_time_seconds, prepend_message=' (at least ')}:" + ) return exec_steps(steps, strict=strict) else: print(f"--- Running {len(steps)} steps with {workers} processes:") @@ -355,14 +366,24 @@ def run_dag( def exec_steps(steps: List[Step], strict: Optional[bool] = None) -> None: + execution_times = {} for i, step in enumerate(steps, 1): - print(f"--- {i}. {step}...") + print(f"--- {i}. {step}{_create_expected_time_message(_get_execution_time(step_name=str(step)))}") + + # Determine strictness level for the current step strict = _detect_strictness_level(step, strict) + with strictness_level(strict): + # Execute the step and measure the time taken time_taken = timed_run(lambda: step.run()) - click.echo(f"{click.style('OK', fg='blue')} ({time_taken:.1f}s)") + execution_times[str(step)] = time_taken + + click.echo(f"{click.style('OK', fg='blue')}{_create_expected_time_message(time_taken)}") print() + # Write the recorded execution times to the file after all steps have been executed + _write_execution_times(execution_times) + def _steps_sort_key(step: Step) -> int: """Sort steps by channel, so that grapher steps are executed first, then garden, then meadow, then snapshots.""" @@ -384,16 +405,27 @@ def exec_steps_parallel(steps: List[Step], workers: int, dag: DAG, strict: Optio # the load on MySQL steps = sorted(steps, key=_steps_sort_key) - # create execution graph from steps - exec_graph = {} - steps_str = {str(step) for step in steps} - for step in steps: - # only add dependencies that are in the list of steps (i.e. 
are dirty) - # NOTE: we have to compare their string versions, the actual objects might have - # different attributes - exec_graph[str(step)] = {str(dep) for dep in step.dependencies if str(dep) in steps_str} + # Use a Manager dict to collect execution times in parallel execution + with Manager() as manager: + execution_times = manager.dict() + + # Create execution graph from steps + exec_graph = {} + steps_str = {str(step) for step in steps} + for step in steps: + # only add dependencies that are in the list of steps (i.e. are dirty) + # NOTE: we have to compare their string versions, the actual objects might have + # different attributes + exec_graph[str(step)] = {str(dep) for dep in step.dependencies if str(dep) in steps_str} + + # Prepare a function for execution that includes the necessary arguments + exec_func = partial(_exec_step_job, execution_times=execution_times, dag=dag, strict=strict) + + # Execute the graph of tasks in parallel + exec_graph_parallel(exec_graph, exec_func, workers) - exec_graph_parallel(exec_graph, _exec_step_job, workers, dag=dag, strict=strict) + # After all tasks have completed, write the execution times to the file + _write_execution_times(dict(execution_times)) def exec_graph_parallel( @@ -433,7 +465,24 @@ def exec_graph_parallel( topological_sorter.done(task) -def _exec_step_job(step_name: str, dag: Optional[DAG] = None, strict: Optional[bool] = None) -> None: +def _create_expected_time_message( + expected_time: Optional[float], prepend_message: str = " (", append_message: str = ")" +) -> str: + minutes, seconds = divmod(expected_time or 0, 60) + if minutes < 1: + partial_message = f"{seconds:.1f}s" + else: + partial_message = f"{int(minutes)}m{seconds: .1f}s" + + if (expected_time is None) or (expected_time == 0): + return "" + else: + return prepend_message + partial_message + append_message + + +def _exec_step_job( + step_name: str, execution_times: MutableMapping, dag: Optional[DAG] = None, strict: Optional[bool] = None +) -> None: """ Executes a step. @@ -441,19 +490,52 @@ def _exec_step_job(step_name: str, dag: Optional[DAG] = None, strict: Optional[b :param dag: The original DAG used to create Step object. This must be the same DAG as given to ETL. :param strict: The strictness level for the step execution. 
""" - print(f"--- Starting {step_name}", flush=True) + print(f"--- Starting {step_name}{_create_expected_time_message(_get_execution_time(step_name))}") assert dag step = parse_step(step_name, dag) strict = _detect_strictness_level(step, strict) with strictness_level(strict): - time_taken = timed_run(lambda: step.run()) + execution_times[step_name] = timed_run(lambda: step.run()) + print(f"--- Finished {step_name} ({execution_times[step_name]:.1f}s)") + + +def _write_execution_times(execution_times: Dict) -> None: + # Write the recorded execution times to a hidden json file that contains the time it took to execute each step + execution_time_file = paths.EXECUTION_TIME_FILE + if execution_time_file.exists(): + with open(execution_time_file, "r") as file: + stored_times = json.load(file) + else: + stored_times = {} - print(f"--- Finished {step_name} ({time_taken:.0f}s)", flush=True) + stored_times.update(execution_times) + with open(execution_time_file, "w") as file: + json.dump(stored_times, file, indent=4, sort_keys=True) + + +def _get_step_identifier(step_name: str) -> str: + return step_name.replace(step_name.split("/")[-2] + "/", "") + + +def _get_execution_time(step_name: str) -> Optional[float]: + # Read execution time of a given step from the hidden json file + # If it doesn't exist, try to read another version of the same step, and if no other version exists, return None + if not paths.EXECUTION_TIME_FILE.exists(): + return None + else: + with open(paths.EXECUTION_TIME_FILE, "r") as file: + execution_times = json.load(file) + execution_time = execution_times.get(step_name) + if not execution_time: + # If the step has not been timed yet, try to find a previous version + step_identifiers = {_get_step_identifier(step): value for step, value in execution_times.items()} + execution_time = step_identifiers.get(_get_step_identifier(step_name)) + return execution_time def enumerate_steps(steps: List[Step]) -> None: for i, step in enumerate(steps, 1): - print(f"{i}. {step}") + print(f"{i}. {step}{_create_expected_time_message(_get_execution_time(str(step)))}") def _detect_strictness_level(step: Step, strict: Optional[bool] = None) -> bool: diff --git a/etl/paths.py b/etl/paths.py index 30e465b6feb..a3fa8889535 100644 --- a/etl/paths.py +++ b/etl/paths.py @@ -62,3 +62,6 @@ # Use paths.DAG_ARCHIVE_FILE to load the complete dag, with active and archive steps. # Otherwise use paths.DAG_FILE to load only active steps, ignoring archive ones. DEFAULT_DAG_FILE = DAG_FILE + +# Hidden ETL file that will keep the time it took to execute each step. 
+EXECUTION_TIME_FILE = BASE_DIR / ".execution_time.json" From 4e612809e48f5f88aeb9d663771000cabad98c11 Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 10 Apr 2024 04:03:34 +0000 Subject: [PATCH 02/61] :robot: automatic wildfires update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 11 +++++------ snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 2f35edebadf..45fb735c98e 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -5,17 +5,16 @@ meta: description: |- The dataset provides a weekly comprehensive overview of fire activity and its environmental impact, incorporating data from the Global Wildfire Information System (GWIS) and satellite imagery from MODIS and VIIRS. It includes metrics such as the area of land burnt, cumulative burnt areas, carbon dioxide emissions from fires, cumulative carbon emissions, the number of fires, and cumulative fire counts. title_snapshot: Seasonal wildfire trends (2024 and later) - description_snapshot: This dataset focuses specifically on older data. A separate snapshot will be created to add more - recent data. + description_snapshot: This dataset focuses specifically on older data. A separate snapshot will be created to add more recent data. citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-04-09 - date_published: 2024-04-09 + date_accessed: 2024-04-10 + date_published: 2024-04-10 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license outs: - - md5: 65a4703accc44038d0f82b83879b006f - size: 11611371 + - md5: 1bc963ac2662d95647d5d69942a1d416 + size: 11623135 path: weekly_wildfires.csv diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index aced7229bef..070ccb99ce4 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: 2024-03-18 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index b697b7fa6d3..8f957225ab9 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. 
url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 44b27194cad..238de18b5f5 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index 72e71085f99..9d1cd01c148 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From c2a05f92259fd058eb4f1f5ba1a9bf440e20b72f Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 10 Apr 2024 04:05:46 +0000 Subject: [PATCH 03/61] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 4dad17eec08..d4df3f712c6 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 7ea6a347dd2cfff19b73c86c94685cec - size: 150179792 + - md5: c871c20f9342720af8d2634b4641d004 + size: 150197770 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 164d270369f..c5c0f09bca9 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. 
wdir: ../../../data/snapshots/who/latest outs: - - md5: b4ddf1f92ee41abb6c060264b93bc487 - size: 25727681 + - md5: 61a80a627866aec81a5fd99e8f169041 + size: 25729116 path: flunet.csv From ae097eebdfe8c044ff53cc535cf12398e0059597 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 10 Apr 2024 12:28:12 +0200 Subject: [PATCH 04/61] =?UTF-8?q?=F0=9F=93=8A=20Update=20dataset=20on=20na?= =?UTF-8?q?tional=20contributions=20to=20climate=20change=20reference=20br?= =?UTF-8?q?anch=20(#2501)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Duplicate code from previous version of national contributions to global warming dataset * Update dataset on national contributions to climate change (#2496) * Adapt snapshots, meadow, garden and grapher steps * Fix spurious negative numbers * Improve format * Various small improvements, following Pablo A suggestions --- dag/emissions.yml | 42 +- .../national_contributions.countries.json | 227 ++++++++++ ...onal_contributions.excluded_countries.json | 9 + .../national_contributions.meta.yml | 428 ++++++++++++++++++ .../2024-04-08/national_contributions.py | 354 +++++++++++++++ .../2024-04-08/national_contributions.py | 22 + .../2024-04-08/national_contributions.py | 50 ++ .../2024-04-08/national_contributions.py | 108 +++++ ...nal_contributions_annual_emissions.csv.dvc | 33 ++ ...contributions_cumulative_emissions.csv.dvc | 33 ++ ...contributions_temperature_response.csv.dvc | 33 ++ 11 files changed, 1325 insertions(+), 14 deletions(-) create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.py create mode 100644 etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py create mode 100644 etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py create mode 100644 snapshots/emissions/2024-04-08/national_contributions.py create mode 100644 snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc create mode 100644 snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc create mode 100644 snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc diff --git a/dag/emissions.yml b/dag/emissions.yml index 623547c5067..5849d3f8642 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -76,20 +76,6 @@ steps: data://grapher/rff/2023-10-19/emissions_weighted_carbon_price: - data://garden/rff/2023-10-19/emissions_weighted_carbon_price # - # Jones et al. (2023) - National contributions to climate change. 
- # - data://meadow/emissions/2023-11-23/national_contributions: - - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv - data://garden/emissions/2023-11-23/national_contributions: - - data://meadow/emissions/2023-11-23/national_contributions - - data://garden/regions/2023-01-01/regions - - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - data://grapher/emissions/2023-11-23/national_contributions: - - data://garden/emissions/2023-11-23/national_contributions - # # IPCC - Emission Factor Database (2023-10-24). # data://meadow/emissions/2023-10-24/emission_factors: @@ -124,9 +110,37 @@ steps: data://garden/emissions/2024-02-26/gdp_and_co2_decoupling: - data://garden/gcp/2023-12-12/global_carbon_budget - data://garden/worldbank_wdi/2023-05-29/wdi + # + # Jones et al. - National contributions to climate change. + # + data://meadow/emissions/2024-04-08/national_contributions: + - snapshot://emissions/2024-04-08/national_contributions_temperature_response.csv + - snapshot://emissions/2024-04-08/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2024-04-08/national_contributions_annual_emissions.csv + data://garden/emissions/2024-04-08/national_contributions: + - data://meadow/emissions/2024-04-08/national_contributions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2024-03-11/income_groups + - data://garden/regions/2023-01-01/regions + data://grapher/emissions/2024-04-08/national_contributions: + - data://garden/emissions/2024-04-08/national_contributions ###################################################################################################################### # Older versions that should be archived once they are not used by any other steps. + # + # Jones et al. (2023) - National contributions to climate change. 
+ # + data://meadow/emissions/2023-11-23/national_contributions: + - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv + data://garden/emissions/2023-11-23/national_contributions: + - data://meadow/emissions/2023-11-23/national_contributions + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2023-04-30/income_groups + data://grapher/emissions/2023-11-23/national_contributions: + - data://garden/emissions/2023-11-23/national_contributions ###################################################################################################################### diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json new file mode 100644 index 00000000000..5b3ccbfe1df --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json @@ -0,0 +1,227 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Christmas Island": "Christmas Island", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "EU27": "European Union (27)", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Faeroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Polynesia": "French Polynesia", + "GLOBAL": "World", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + 
"Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Occupied Palestinian Territory": "Palestine", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USA": "United States", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + 
"Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires", + "Leeward Islands": "Leeward Islands", + "Panama Canal Zone": "Panama Canal Zone", + "Ryukyu Islands": "Ryukyu Islands", + "St. Kitts-Nevis-Anguilla": "St. Kitts-Nevis-Anguilla", + "LDC": "Least developed countries (Jones et al.)", + "OECD": "OECD (Jones et al.)" +} diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json new file mode 100644 index 00000000000..f4e1bbdf837 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json @@ -0,0 +1,9 @@ +[ + "ANNEXI", + "ANNEXII", + "BASIC", + "EIT", + "LMDC", + "NONANNEX", + "Pacific Islands (Palau)" +] diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml new file mode 100644 index 00000000000..8d6fd94bf5e --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml @@ -0,0 +1,428 @@ +definitions: + measured_in_celsius: &measured-in-celsius |- + Measured in °C. + measured_in_tonnes: &measured-in-tonnes |- + Measured in tonnes. + measured_in_tonnes_per_person: &measured-in-tonnes-per-person |- + Measured in tonnes per person. + measured_in_co2_eq: &measured-in-co2-eq |- + Measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + measured_in_co2_eq_per_person: &measured-in-co2-eq-per-person |- + Measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions: &ghg-emissions |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions_per_person: &ghg-emissions-per-person |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + processing_methane: &processing-methane |- + Methane emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources and 27.2 for agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_nitrous_oxide: &processing-nitrous-oxide |- + Nitrous oxide emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273. This factor is taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_greenhouse_gases: &processing-greenhouse-gases |- + Emissions given in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). 
+ common: + processing_level: major + presentation: + topic_tags: + - CO2 & Greenhouse Gas Emissions + +dataset: + update_period_days: 365 + description: |- + Jones et al. quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. + +tables: + national_contributions: + variables: + # Emissions of CH4, CO2, N2O in tonnes (as originally given in the data). + annual_emissions_ch4_fossil: + title: Annual methane emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land: + title: Annual methane emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total: + title: Annual methane emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions + annual_emissions_co2_fossil: + title: Annual CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from fossil fuels and industry + annual_emissions_co2_land: + title: Annual CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from agriculture and land use + annual_emissions_co2_total: + title: Annual CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions + annual_emissions_n2o_fossil: + title: Annual nitrous oxide emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land: + title: Annual nitrous oxide emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total: + title: Annual nitrous oxide emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions + # Emissions (calculated by OWID) of CH4, CO2, N2O in tonnes of CO2eq, as well as combined GHG emissions in CO2eq. 
+ annual_emissions_ghg_fossil_co2eq: + title: Annual greenhouse gas emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from fossil fuels and industry + annual_emissions_ghg_land_co2eq: + title: Annual greenhouse gas emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from agriculture and land use + annual_emissions_ghg_total_co2eq: + title: Annual greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions + annual_emissions_ch4_fossil_co2eq: + title: Annual methane emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land_co2eq: + title: Annual methane emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total_co2eq: + title: Annual methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions + annual_emissions_n2o_fossil_co2eq: + title: Annual nitrous oxide emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land_co2eq: + title: Annual nitrous oxide emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total_co2eq: + title: Annual nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions + # Cumulative emissions of CH4, CO2, N2O and GHG, in tonnes of CO2eq (as originally given in the data). 
+ cumulative_emissions_ghg_fossil: + title: Cumulative greenhouse gas emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from fossil fuels and industry + cumulative_emissions_ghg_land: + title: Cumulative greenhouse gas emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from agriculture and land use + cumulative_emissions_ghg_total: + title: Cumulative greenhouse gas emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions + cumulative_emissions_ch4_fossil: + title: Cumulative methane emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from fossil fuels and industry + cumulative_emissions_ch4_land: + title: Cumulative methane emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from agriculture and land use + cumulative_emissions_ch4_total: + title: Cumulative methane emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions + cumulative_emissions_co2_fossil: + title: Cumulative CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from fossil fuels and industry + cumulative_emissions_co2_land: + title: Cumulative CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from agriculture and land use + cumulative_emissions_co2_total: + title: Cumulative CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions + cumulative_emissions_n2o_fossil: + title: Cumulative nitrous oxide emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from fossil fuels and industry + cumulative_emissions_n2o_land: + title: Cumulative nitrous oxide emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from agriculture and land use + cumulative_emissions_n2o_total: + title: Cumulative nitrous oxide emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions + # Temperature response to emissions of CH4, CO2, N2O and GHG, in °C (as originally given in the data). 
+ temperature_response_ghg_fossil: + title: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + temperature_response_ghg_land: + title: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + temperature_response_ghg_total: + title: Change in global mean surface temperature caused by greenhouse gas emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions + temperature_response_ch4_fossil: + title: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + temperature_response_ch4_land: + title: Change in global mean surface temperature caused by methane emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from agriculture and land use + temperature_response_ch4_total: + title: Change in global mean surface temperature caused by methane emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of methane. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. 
+ presentation: + title_public: Change in global mean surface temperature caused by methane emissions + temperature_response_co2_fossil: + title: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + temperature_response_co2_land: + title: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + temperature_response_co2_total: + title: Change in global mean surface temperature caused by CO₂ emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions + temperature_response_n2o_fossil: + title: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + temperature_response_n2o_land: + title: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + temperature_response_n2o_total: + title: Change in global mean surface temperature caused by nitrous oxide emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions + # Share of emissions (calculated by OWID), e.g. methane emissions as a percentage of global methane emissions. + # NOTE: Using CO2eq or tonnes of the original gas is irrelevant when calculated as a share of global emissions. + share_of_annual_emissions_ghg_total: + title: Share of global greenhouse gas emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's greenhouse gas emissions." + description_processing: *processing-greenhouse-gases + presentation: + title_public: Share of global greenhouse gas emissions + share_of_annual_emissions_ch4_total: + title: Share of global methane emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's methane emissions." 
+ presentation: + title_public: Share of global methane emissions + share_of_annual_emissions_co2_total: + title: Share of global CO₂ emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's carbon dioxide emissions." + presentation: + title_public: Share of global CO₂ emissions + share_of_annual_emissions_n2o_total: + title: Share of global nitrous oxide emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's nitrous oxide emissions." + presentation: + title_public: Share of global nitrous oxide emissions + # Share of global temperature change caused by greenhouse gas emissions from each country (calculated by OWID). + share_of_temperature_response_ghg_total: + title: Share of contribution to global warming + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's temperature change." + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Share of contribution to global warming + # Per capita emissions (calculated by OWID). + annual_emissions_co2_total_per_capita: + title: Per-capita CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes-per-person + presentation: + title_public: Per-capita CO₂ emissions + annual_emissions_ch4_total_co2eq_per_capita: + title: Per-capita methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-methane + presentation: + title_public: Per-capita methane emissions + annual_emissions_n2o_total_co2eq_per_capita: + title: Per-capita nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-nitrous-oxide + presentation: + title_public: Per-capita nitrous oxide emissions + annual_emissions_ghg_total_co2eq_per_capita: + title: Per-capita greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions-per-person + description_processing: *processing-greenhouse-gases + presentation: + title_public: Per-capita greenhouse gas emissions diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..6ac00bafe70 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,354 @@ +"""Load a meadow dataset and create a garden dataset.""" + + +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table, Variable +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor to change from teragrams to tonnes. +TERAGRAMS_TO_TONNES = 1e6 +# Conversion factor to change from petagrams to tonnes. 
+PETAGRAMS_TO_TONNES = 1e9 + +# Conversion factors to change from tonnes of gases emitted to tonnes of CO2 equivalents (taken from IPCC AR6). +CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS = 29.8 +CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS = 27.2 +N2O_EMISSIONS_TO_CO2_EQUIVALENTS = 273 + +# Gases and components expected to be in the data, and how to rename them. +GASES_RENAMING = { + "3-GHG": "ghg", + "CH[4]": "ch4", + "CO[2]": "co2", + "N[2]*O": "n2o", +} +COMPONENTS_RENAMING = { + "Fossil": "fossil", + "LULUCF": "land", + "Total": "total", +} + +# Columns for which we will create "share" variables, e.g. the percentage of methane emissions that a country produces +# in a year with respect to the world's methane emissions on the same year. +# NOTE: For this calculation, it doesn't matter if we use the total or the CO2-equivalent emissions. +SHARE_VARIABLES = [ + "annual_emissions_ch4_total", + "annual_emissions_co2_total", + "annual_emissions_n2o_total", + "annual_emissions_ghg_total_co2eq", + "temperature_response_ghg_total", +] + +# Columns for which a per-capita variable will be created. +PER_CAPITA_VARIABLES = [ + "annual_emissions_ch4_total_co2eq", + "annual_emissions_co2_total", + "annual_emissions_n2o_total_co2eq", + "annual_emissions_ghg_total_co2eq", +] + +# Regions to be added by aggregating data from their member countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, + # EU27 is already included in the original data. + # "European Union (27)": {}, +} + + +def run_sanity_checks_on_inputs(tb): + # Sanity checks. + error = "Names of gases have changed." + assert set(tb["gas"]) == set(GASES_RENAMING), error + error = "Names of components have changed." + assert set(tb["component"]) == set(COMPONENTS_RENAMING), error + error = "Units have changed." + assert set(tb["unit"]) == set( + ["Tg~CH[4]~year^-1", "Pg~CO[2]~year^-1", "Tg~N[2]*O~year^-1", "Pg~CO[2]*-e[100]", "°C"] + ), error + + +def add_kuwaiti_oil_fires_to_kuwait(tb: Table) -> Table: + tb = tb.copy() + + # NOTE: Use this function before harmonizing country names. Otherwise adapt the following definitions. + kuwait = "Kuwait" + oil_fires = "Kuwaiti Oil Fires" + + # Sanity check. + error = f"'{kuwait}' or '{oil_fires}' not found in the data." + assert kuwait in set(tb["country"]), error + assert oil_fires in set(tb["country"]), error + + # Add the emissions from the Kuwaiti oil fires (in 1991) to Kuwait. 
+ tb_kuwait = tb[tb["country"] == kuwait].drop(columns="country").set_index("year") + tb_oil_fires = tb[tb["country"] == oil_fires].drop(columns="country").fillna(0).set_index(["year"]) + tb_combined = (tb_kuwait + tb_oil_fires).reset_index().assign(**{"country": kuwait}) + + # Replace the original data for Kuwait by the combined data. + tb_updated = pr.concat([tb[tb["country"] != kuwait].reset_index(drop=True), tb_combined], ignore_index=True) + + # Sort conveniently. + tb_updated = tb_updated.sort_values(["country", "year"]).reset_index(drop=True) + + return tb_updated + + +def add_emissions_in_co2_equivalents(tb: Table) -> Table: + # Add columns for fossil/land/total emissions of CH4 in terms of CO2 equivalents. + # NOTE: For methane, we apply different conversion factors for fossil and land-use emissions. + tb["annual_emissions_ch4_fossil_co2eq"] = ( + tb["annual_emissions_ch4_fossil"] * CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS + ) + tb["annual_emissions_ch4_land_co2eq"] = tb["annual_emissions_ch4_land"] * CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS + tb["annual_emissions_ch4_total_co2eq"] = ( + tb["annual_emissions_ch4_fossil_co2eq"] + tb["annual_emissions_ch4_land_co2eq"] + ) + + # Add columns for fossil/land/total emissions of N2O in terms of CO2 equivalents. + # NOTE: For nitrous oxide, we apply the same conversion factors for fossil and land-use emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_n2o_{component}_co2eq"] = ( + tb[f"annual_emissions_n2o_{component}"] * N2O_EMISSIONS_TO_CO2_EQUIVALENTS + ) + + # Add columns for fossil/land/total emissions of all GHG in terms of CO2 equivalents. + # NOTE: The file of annual emissions does not include GHG emissions, which is why we need to add them now. + # However, the files of temperature response and cumulative emissions do include GHG emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_ghg_{component}_co2eq"] = ( + tb[f"annual_emissions_co2_{component}"] + + tb[f"annual_emissions_ch4_{component}_co2eq"] + + tb[f"annual_emissions_n2o_{component}_co2eq"] + ) + + return tb + + +def add_share_variables(tb: Table) -> Table: + tb = tb.copy() + + # Create "share" variables (percentages with respect to global). + # To do that, first create a separate table for global data, and add it to the main table. + tb_global = tb[tb["country"] == "World"][["year"] + SHARE_VARIABLES].reset_index(drop=True) + + tb = tb.merge(tb_global, on=["year"], how="left", suffixes=("", "_global")) + # For a list of variables, add the percentage with respect to global. + for variable in SHARE_VARIABLES: + new_variable = f"share_of_{variable.replace('_co2eq', '')}" + tb[new_variable] = 100 * tb[variable] / tb[f"{variable}_global"] + + # Drop unnecessary columns for global data. + tb = tb.drop(columns=[column for column in tb.columns if column.endswith("_global")], errors="raise") + + return tb + + +def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table: + tb = tb.copy() + + # Add population to data. + tb = geo.add_population_to_table( + tb=tb, + ds_population=ds_population, + warn_on_missing_countries=False, + ) + + # Add per-capita variables. + for variable in PER_CAPITA_VARIABLES: + tb[f"{variable}_per_capita"] = tb[variable] / tb["population"] + + # Drop population column. + tb = tb.drop(columns="population", errors="raise") + + return tb + + +def fix_emissions_jump_in_1850(tb: Table) -> Table: + # There is data from 1830 for some variables and from 1850 for others. 
+ # However, when inspecting data between 1830 and 1850 (e.g. annual_emissions_co2_total) there is an abrupt jump + # between 1849 and 1850, which happens for many countries (e.g. Spain, or World). + # This jump seems to be spurious, and therefore we start all time series from 1850. + + # First check that the jump is still in the data. + emissions_before_jump = tb[(tb["country"] == "World") & (tb["year"] == 1849)]["annual_emissions_co2_total"].item() + emissions_after_jump = tb[(tb["country"] == "World") & (tb["year"] == 1850)]["annual_emissions_co2_total"].item() + error = "Spurious jump between 1849 and 1850 is not in the data anymore. Remove this part of the code." + assert emissions_after_jump / emissions_before_jump > 10, error + + # Visually inspect the jump. + # import plotly.express as px + # px.line(tb[tb["country"]=="World"], x="year", y="annual_emissions_co2_total", markers=True) + + # Start all data after the jump. + tb = tb[tb["year"] >= 1850].reset_index(drop=True) + + return tb + + +def run_sanity_checks_on_outputs(tb: Table) -> None: + error = "Share of global emissions cannot be larger than 101%" + assert (tb[[column for column in tb.columns if "share" in column]].max() < 101).all(), error + error = "Share of global emissions was not expected to be smaller than -1%" + # Some countries did contribute negatively to CO2 emissions, however overall the negative contribution is always + # smaller than 1% in absolute value. + assert (tb[[column for column in tb.columns if "share" in column]].min() > -1).all(), error + + # Ensure that no country contributes to emissions more than the entire world. + columns_that_should_be_smaller_than_global = [ + column for column in tb.drop(columns=["country", "year"]).columns if "capita" not in column + ] + tb_global = tb[tb["country"] == "World"].drop(columns="country") + check = pr.merge( + tb[tb["country"] != "World"].reset_index(drop=True), tb_global, on="year", how="left", suffixes=("", "_global") + ) + for column in columns_that_should_be_smaller_than_global: + # It is in principle possible that some region would emit more than the world, if the rest of regions + # were contributing with negative CO2 emissions (e.g. High-income countries in 1854). + # However, the difference should be very small. + error = f"Region contributed to {column} more than the entire world." + assert check[(check[column] - check[f"{column}_global"]) / check[f"{column}_global"] > 0.00001].empty, error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("national_contributions") + tb = ds_meadow["national_contributions"].reset_index() + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # + # Process data. + # + # Sanity checks. + run_sanity_checks_on_inputs(tb=tb) + + # Rename gases and components. + tb["gas"] = Variable( + map_series( + series=tb["gas"], mapping=GASES_RENAMING, warn_on_missing_mappings=True, warn_on_unused_mappings=True + ) + ).copy_metadata(tb["gas"]) + tb["component"] = Variable( + map_series( + series=tb["component"], + mapping=COMPONENTS_RENAMING, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + ) + ).copy_metadata(tb["component"]) + + # Convert units from teragrams and petagrams to tonnes. 
+ tb.loc[tb["unit"].str.startswith("Tg"), "data"] *= TERAGRAMS_TO_TONNES + tb.loc[tb["unit"].str.startswith("Pg"), "data"] *= PETAGRAMS_TO_TONNES + + # Transpose data. + tb = tb.pivot( + index=["country", "year"], columns=["file", "gas", "component"], values="data", join_column_levels_with="_" + ) + + # We add the emissions from the Kuwaiti oil fires in 1991 (which are also included as a separate country) as part + # of the emissions of Kuwait. + # This ensures that these emissions will be included in aggregates of regions that include Kuwait. + tb = add_kuwaiti_oil_fires_to_kuwait(tb=tb) + + # Harmonize country names. + tb = geo.harmonize_countries( + tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + ) + + # Replace spurious negative values with zeros (and ensure they are small numbers, within the uncertainty). + columns_that_cannot_be_negative = [column for column in tb.columns if "fossil" in column] + #################################################################################################################### + # TODO: For some reason, cumulative_emissions_ch4_fossil (and therefore cumulative_emissions_ghg_fossil) have + # big negative values. For example for Ireland's value in 2022 is of -2.93e+08! + # I will look into this, but, for now, I'll ignore those negative values (we are not using these indicators in + # any chart). + columns_that_cannot_be_negative = [ + column + for column in columns_that_cannot_be_negative + if column not in ["cumulative_emissions_ch4_fossil", "cumulative_emissions_ghg_fossil"] + ] + #################################################################################################################### + for column in columns_that_cannot_be_negative: + # Ensure all negative values are just numerical noise. + assert (tb[column].fillna(0) >= -2e-4).all() + # Replace those values by zero. + tb[column] = tb[column].clip(lower=0) + + # Add region aggregates. + tb = geo.add_regions_to_table( + tb=tb, ds_regions=ds_regions, ds_income_groups=ds_income_groups, regions=REGIONS, min_num_values_per_year=1 + ) + + # Add columns for emissions in terms of CO2 equivalents. + tb = add_emissions_in_co2_equivalents(tb=tb) + + # Add "share" variables (percentages with respect to global emissions). + tb = add_share_variables(tb=tb) + + # Add per-capita variables. + tb = add_per_capita_variables(tb=tb, ds_population=ds_population) + + # Fix spurious jump in the data in 1850. + tb = fix_emissions_jump_in_1850(tb=tb) + + # Sanity checks. + run_sanity_checks_on_outputs(tb=tb) + + # Set an appropriate index and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py b/etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..a8bf5f2bebf --- /dev/null +++ b/etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. 
+ ds_garden = paths.load_dataset("national_contributions") + tb_garden = ds_garden["national_contributions"] + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py b/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..df58d26b5f6 --- /dev/null +++ b/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,50 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve all snapshots of the dataset. + snap_annual = paths.load_snapshot("national_contributions_annual_emissions.csv") + snap_cumulative = paths.load_snapshot("national_contributions_cumulative_emissions.csv") + snap_temperature = paths.load_snapshot("national_contributions_temperature_response.csv") + + # Load data from snapshots. + tb_annual = snap_annual.read(underscore=True) + tb_cumulative = snap_cumulative.read(underscore=True) + tb_temperature = snap_temperature.read(underscore=True) + + # + # Process data. + # + # Combine all data into one table. + tb = pr.concat( + [ + tb_annual.assign(**{"file": "annual_emissions"}), + tb_cumulative.assign(**{"file": "cumulative_emissions"}), + tb_temperature.assign(**{"file": "temperature_response"}), + ], + ignore_index=True, + short_name=paths.short_name, + ) + + # Rename columns conveniently. + tb = tb.rename(columns={"cntr_name": "country"}, errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.format(keys=["country", "year", "file", "gas", "component"]) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/emissions/2024-04-08/national_contributions.py b/snapshots/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..ed14f53b5ee --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,108 @@ +"""Script to create a snapshot of dataset National contributions to climate change (Jones et al.). + +NOTE: All metadata fields are automatically updated by this script. However, the dataset description may change a bit +(for example they may cite more recent papers). Visually inspect the dataset description and manually make small +modifications, if needed. + +""" + +from datetime import datetime +from pathlib import Path +from typing import Dict + +import click +import requests +from bs4 import BeautifulSoup + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of data files to snapshot. +DATA_FILES = [ + "annual_emissions.csv", + "cumulative_emissions.csv", + "temperature_response.csv", +] + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + for data_file in DATA_FILES: + # Create a new snapshot. + snap = Snapshot(f"emissions/{SNAPSHOT_VERSION}/national_contributions_{data_file}") + + # Get the publication date (it needs to be done only once). 
+ extracted_fields = extract_metadata_from_main_page(snap) + + for field in extracted_fields: + # Replace metadata fields with the new extracted fields. + setattr(snap.metadata.origin, field, extracted_fields[field]) + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +def extract_metadata_from_main_page(snap: Snapshot) -> Dict[str, str]: + """Extract the publication date.""" + # Get the full HTML content of the main page. + response = requests.get(snap.metadata.origin.url_main) # type: ignore + + # The "latest" url redirects to the new record (which we need to extract other fields). + response_final = response.url + + # Parse the HTML content of the main page. + soup = BeautifulSoup(response.content, "html.parser") + + # Extract the publication date, which is given in one of the first sentences as in, e.g. "Published March 19, 2024". + date_published_str = [line.split("Published")[1].strip() for line in soup.text.split("\n") if "Published" in line][ + 0 + ] + + # Convert to ISO format. + date_published = datetime.strptime(date_published_str, "%B %d, %Y").strftime("%Y-%m-%d") + + # Extract the version of the data producer. + version_producer = [line.split("| Version ")[1].strip() for line in soup.text.split("\n") if "| Version " in line][ + 0 + ] + + # The download links have the years hardcoded in the url, so we need to update them. + file_name = snap.metadata.origin.url_download.split("/")[-1] # type: ignore + # Assume that the latest informed year in the data is 2 years before the current version. + file_name_new = file_name.split("-")[0] + "-" + str(int(version_producer.split(".")[0]) - 2) + ".csv" + # Create the new download url (using the new token for the latest version, and the latest year in the file name). + url_download = response_final + "/files/" + file_name_new + + # The full citation is not included in the HTML and is fetched from an API. + response_citation = requests.get( + response_final.replace("records/", "api/records/") + "?style=chicago-fullnote-bibliography", + headers={"Accept": "text/x-bibliography"}, + ) + + # Extract the full citation. + citation_full = response_citation.text + + # Gather all extracted fields. + extracted_fields = { + "date_published": date_published, + "version_producer": version_producer, + "url_download": url_download, + "citation_full": citation_full, + } + + return extracted_fields + + +if __name__ == "__main__": + main() diff --git a/snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc b/snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc new file mode 100644 index 00000000000..5ee24c0880e --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc @@ -0,0 +1,33 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). 
+ + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O emissions are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Annual emissions + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, March 19, 2024. https://doi.org/10.5281/zenodo.10839859. + version_producer: '2024.1' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/10839859/files/EMISSIONS_ANNUAL_1830-2022.csv + date_accessed: '2024-04-08' + date_published: '2024-03-19' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: 9f931081993e0367f14aaeddb338cbcb + size: 26279535 + path: national_contributions_annual_emissions.csv diff --git a/snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc b/snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc new file mode 100644 index 00000000000..69c308eb405 --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc @@ -0,0 +1,33 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources during since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O emissions are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). 
+ + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Cumulative emissions + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, March 19, 2024. https://doi.org/10.5281/zenodo.10839859. + version_producer: '2024.1' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/10839859/files/EMISSIONS_CUMULATIVE_CO2e100_1851-2022.csv + date_accessed: '2024-04-08' + date_published: '2024-03-19' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: f4f7519994d16cee7a791cb9277c0793 + size: 33485575 + path: national_contributions_cumulative_emissions.csv diff --git a/snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc b/snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc new file mode 100644 index 00000000000..dab115b1493 --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc @@ -0,0 +1,33 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O emissions are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Temperature response + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, March 19, 2024. https://doi.org/10.5281/zenodo.10839859. 
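As a rough illustration of the GWP*/TCRE methodology summarized in these snapshot descriptions (a sketch only, not part of the patch: the conversion factors and TCRE value below are placeholders rather than the IPCC AR6 best estimates the authors use, and the real GWP* treatment of short-lived gases is rate-based rather than a single constant per gas):

```python
# Hypothetical, illustrative constants - NOT the values used in the dataset.
CH4_TO_CO2EQ = 29.8  # tonnes of CO2-eq per tonne of CH4 (placeholder)
N2O_TO_CO2EQ = 273.0  # tonnes of CO2-eq per tonne of N2O (placeholder)
TCRE = 0.45e-12  # °C of GMST change per tonne of cumulative CO2-eq (placeholder)


def annual_co2eq(co2: float, ch4: float, n2o: float) -> float:
    """Combine one year of emissions (in tonnes) into a single CO2-equivalent figure."""
    return co2 + ch4 * CH4_TO_CO2EQ + n2o * N2O_TO_CO2EQ


def gmst_response(annual_co2eq_by_year: list[float]) -> float:
    """Warming is proportional to cumulative CO2-equivalent emissions (TCRE approach)."""
    return TCRE * sum(annual_co2eq_by_year)
```

The garden step earlier in this patch follows the same constant-factor pattern in `add_emissions_in_co2_equivalents`, using separate CH₄ factors for fossil and land-use emissions and a single factor for N₂O.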
+ version_producer: '2024.1' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/10839859/files/GMST_response_1851-2022.csv + date_accessed: '2024-04-08' + date_published: '2024-03-19' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: e46a789f557012f78c6fb98a1816a797 + size: 28745402 + path: national_contributions_temperature_response.csv From a4f816a1a4909b14ee3e8ea0ee3700bd53f2c969 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 10 Apr 2024 12:29:34 +0200 Subject: [PATCH 05/61] :sparkles: Improve datadiff (#2494) * :sparkles: improve datadiff --- apps/backport/backport.py | 4 +- apps/backport/bulk_backport.py | 4 +- apps/metadata_migrate/cli.py | 7 +- etl/compare.py | 14 +- etl/data_helpers/population.py | 2 +- etl/datadiff.py | 335 +++++++++++++++++++------- etl/db.py | 24 +- etl/grapher_helpers.py | 6 +- etl/grapher_import.py | 8 +- etl/variable_mapping_translate.py | 5 +- lib/catalog/owid/catalog/variables.py | 30 ++- lib/repack/owid/repack/__init__.py | 5 +- lib/repack/tests/test_repack.py | 19 ++ tests/data_helpers/test_geo.py | 4 +- tests/test_datadiff.py | 46 ++-- 15 files changed, 368 insertions(+), 145 deletions(-) diff --git a/apps/backport/backport.py b/apps/backport/backport.py index bb0186581f4..b3fbe2c2a65 100644 --- a/apps/backport/backport.py +++ b/apps/backport/backport.py @@ -20,7 +20,7 @@ from etl import config, paths from etl import grapher_model as gm from etl.backport_helpers import GrapherConfig -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.files import checksum_str from etl.snapshot import Snapshot, SnapshotMeta @@ -346,7 +346,7 @@ def _load_values(engine: Engine, variable_ids: list[int]) -> pd.DataFrame: "entityCode": "entity_code", } ) - vf: pd.DataFrame = pd.read_sql(q, engine, params={"variable_ids": variable_ids}) + vf = read_sql(q, engine, params={"variable_ids": variable_ids}) df = df.merge(vf, on="variable_id") # try converting values to float if possible, this can make the data 50% smaller diff --git a/apps/backport/bulk_backport.py b/apps/backport/bulk_backport.py index c9d61ceacfe..bb10fd859f0 100644 --- a/apps/backport/bulk_backport.py +++ b/apps/backport/bulk_backport.py @@ -9,7 +9,7 @@ from sqlalchemy.engine import Engine from etl import config -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.snapshot import snapshot_catalog from etl.steps import load_dag @@ -195,7 +195,7 @@ def _active_datasets( limit %(limit)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={ diff --git a/apps/metadata_migrate/cli.py b/apps/metadata_migrate/cli.py index 48201ac4dcc..69aa7abba17 100644 --- a/apps/metadata_migrate/cli.py +++ b/apps/metadata_migrate/cli.py @@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional import click -import pandas as pd import structlog from owid.catalog import Dataset, DatasetMeta, License, Origin, Source, Table from rich import print @@ -16,7 +15,7 @@ from etl import config from etl import grapher_model as gm from etl.command import main as etl_main -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.metadata_export import merge_or_create_yaml, reorder_fields from etl.paths import BASE_DIR, DAG_FILE, DATA_DIR, STEP_DIR @@ -108,7 +107,7 @@ def cli( select config from charts where slug = '{chart_slug}' """ - df = pd.read_sql(q, engine) + df = read_sql(q, engine) if df.empty: raise ValueError(f"no chart found for slug {chart_slug}") @@ 
-359,7 +358,7 @@ def _load_grapher_config(engine: Engine, col: str, ds_meta: DatasetMeta) -> Dict d.version = '{ds_meta.version}' and d.shortName = '{ds_meta.short_name}' """ - cf = pd.read_sql(q, engine) + cf = read_sql(q, engine) if len(cf) == 0: log.warning(f"no chart found for variable {col}") return {} diff --git a/etl/compare.py b/etl/compare.py index 527ae17601f..a690224f3b3 100644 --- a/etl/compare.py +++ b/etl/compare.py @@ -17,7 +17,7 @@ from apps.backport.datasync.data_metadata import variable_data_df_from_s3 from etl import tempcompare -from etl.db import get_engine +from etl.db import get_engine, read_sql @click.group(name="compare", cls=RichGroup) @@ -293,11 +293,7 @@ def read_dataset_from_db(env_path: str, namespace: str, version: str, dataset: s WHERE version = %(version)s and namespace = %(namespace)s and shortName = %(dataset)s """ - df = pd.read_sql( - q, - engine, - params={"version": version, "namespace": namespace, "dataset": dataset}, - ) + df = read_sql(q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}) # drop uninteresting columns df = df.drop(["createdByUserId", "dataEditedAt", "metadataEditedAt", "updatedAt"], axis=1) @@ -316,7 +312,7 @@ def read_variables_from_db(env_path: str, namespace: str, version: str, dataset: WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, @@ -341,7 +337,7 @@ def read_sources_from_db(env_path: str, namespace: str, version: str, dataset: s WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, @@ -365,7 +361,7 @@ def read_values_from_s3(env_path: str, namespace: str, version: str, dataset: st JOIN datasets as d ON v.datasetId = d.id WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - vf = pd.read_sql( + vf = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, diff --git a/etl/data_helpers/population.py b/etl/data_helpers/population.py index d768aab7fa9..6b150b4033c 100644 --- a/etl/data_helpers/population.py +++ b/etl/data_helpers/population.py @@ -111,7 +111,7 @@ def add_population( # Build age groups df_pop = [] - pop["age"] = pop["age"].replace({"100+": 100}).astype("uint") + pop["age"] = pop["age"].astype(str).replace({"100+": 100}).astype("uint") for age_group_name, age_ranges in age_group_mapping.items(): if not age_ranges: age_ranges = [None, None] diff --git a/etl/datadiff.py b/etl/datadiff.py index 3fa6dc26d33..f45008a80df 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -3,7 +3,7 @@ import re from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, cast +from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast import numpy as np import pandas as pd @@ -11,7 +11,7 @@ import rich import rich_click as click import structlog -from owid.catalog import Dataset, DatasetMeta, LocalCatalog, RemoteCatalog, Table, find +from owid.catalog import Dataset, DatasetMeta, LocalCatalog, RemoteCatalog, Table, VariableMeta, find from owid.catalog.catalogs import CHANNEL, OWID_CATALOG_URI from rich.console import Console from rich.panel import Panel @@ -111,16 +111,33 @@ def _diff_tables(self, 
ds_a: Dataset, ds_b: Dataset, table_name: str): for col in ds_b[table_name].columns: self.p(f"\t\t[green]+ Column [b]{col}[/b]") else: - table_a = ds_a[table_name] - table_b = ds_b[table_name] + # get both tables in parallel + with ThreadPoolExecutor() as executor: + future_a = executor.submit(ds_a.__getitem__, table_name) + future_b = executor.submit(ds_b.__getitem__, table_name) + + table_a = future_a.result() + table_b = future_b.result() # set default index for datasets that don't have one if table_a.index.names == [None] and table_b.index.names == [None]: candidates = {"entity", "date", "country", "year"} - new_index = list(candidates & set(table_a.columns) & set(table_b.columns)) - if new_index: - table_a = table_a.set_index(new_index) - table_b = table_b.set_index(new_index) + new_index_cols = list(candidates & set(table_a.columns) & set(table_b.columns)) + if new_index_cols: + table_a = table_a.set_index(new_index_cols) + table_b = table_b.set_index(new_index_cols) + + # if using default index, it is possible that we have non-determinstic order + # try sorting by the first two columns + if ( + table_a.index.names == [None] + and table_b.index.names == [None] + and len(table_a) == len(table_b) + and table_a.index[-1] == len(table_a) - 1 + and len(table_a) <= 1000 + ): + table_a = table_a.sort_values(list(table_a.columns)).reset_index(drop=True) + table_b = table_b.sort_values(list(table_b.columns)).reset_index(drop=True) # indexes differ, reset them to make them somehow comparable if table_a.index.names != table_b.index.names: @@ -131,21 +148,19 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): # only sort index if different to avoid unnecessary sorting for huge datasets such as ghe if len(table_a) != len(table_b) or not _index_equals(table_a, table_b): - index_diff = True - table_a, table_b, eq_index = _align_tables(table_a, table_b) - - # if only index order has changed, don't report it - if eq_index.all(): - index_diff = False + table_a, table_b, eq_index, new_index, removed_index = _align_tables(table_a, table_b) else: - index_diff = False eq_index = pd.Series(True, index=table_a.index) + new_index = pd.Series(False, index=table_a.index) + removed_index = pd.Series(False, index=table_a.index) # resetting index will make comparison easier - dims = table_a.index.names + dims = [dim for dim in table_a.index.names if dim is not None] table_a: Table = table_a.reset_index() table_b: Table = table_b.reset_index() - eq_index = eq_index.reset_index(drop=True) + eq_index = cast(pd.Series, eq_index.reset_index(drop=True)) + new_index = cast(pd.Series, new_index.reset_index(drop=True)) + removed_index = cast(pd.Series, removed_index.reset_index(drop=True)) # compare table metadata diff = _dict_diff(_table_metadata_dict(table_a), _table_metadata_dict(table_b), tabs=3) @@ -157,8 +172,31 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): else: self.p(f"\t[white]= Table [b]{table_name}[/b]") + # compare index + if not eq_index.all(): + for dim in dims: + if eq_index.all(): + self.p(f"\t\t[white]= Dim [b]{dim}[/b]") + else: + self.p(f"\t\t[yellow]~ Dim [b]{dim}[/b]") + if self.verbose: + dims_without_dim = [d for d in dims if d != dim] + out = _data_diff( + table_a, + table_b, + dim, + dims_without_dim, + eq_index, + eq_index, + new_index, + removed_index, + tabs=4, + ) + if out: + self.p(out) + # compare columns - all_cols = sorted(set(table_a.columns) | set(table_b.columns)) + all_cols = sorted((set(table_a.columns) | set(table_b.columns)) - 
set(dims)) for col in all_cols: if self.cols and not re.search(self.cols, col): continue @@ -171,31 +209,33 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): col_a = table_a[col] col_b = table_b[col] + # metadata diff + meta_diff = _dict_diff( + _column_metadata_dict(col_a.metadata), _column_metadata_dict(col_b.metadata), tabs=4 + ) + # equality on index and series eq_data = series_equals(table_a[col], table_b[col]) - data_diff = (~eq_data).any() - eq = eq_index & eq_data - - col_a_meta = col_a.metadata.to_dict() - col_b_meta = col_b.metadata.to_dict() - meta_diff = _dict_diff(col_a_meta, col_b_meta, tabs=4) - - changed = ( - (["data"] if data_diff else []) - + (["metadata"] if meta_diff else []) - + (["index"] if index_diff else []) - ) + changed = [] + if meta_diff: + changed.append("changed [u]metadata[/u]") + if new_index.any(): + changed.append("new [u]data[/u]") + if (~eq_data[~new_index]).any(): + changed.append("changed [u]data[/u]") if changed: - self.p(f"\t\t[yellow]~ Column [b]{col}[/b] (changed [u]{' & '.join(changed)}[/u])") + self.p(f"\t\t[yellow]~ Column [b]{col}[/b] ({', '.join(changed)})") if self.verbose: if meta_diff: - self.p(_dict_diff(col_a_meta, col_b_meta, tabs=4)) - if data_diff or index_diff: + self.p(meta_diff) + if new_index.any() or removed_index.any() or (~eq_data).any(): if meta_diff: self.p("") - out = _data_diff(table_a, table_b, col, dims, tabs=4, eq=eq) + out = _data_diff( + table_a, table_b, col, dims, eq_data, eq_index, new_index, removed_index, tabs=4 + ) if out: self.p(out) else: @@ -279,6 +319,13 @@ def __getitem__(self, name: str) -> Table: is_flag=True, help="Print code snippet for loading both tables, useful for debugging in notebook", ) +@click.option( + "--workers", + "-w", + type=int, + help="Use multiple threads.", + default=1, +) def cli( path_a: str, path_b: str, @@ -288,11 +335,14 @@ def cli( exclude: Optional[str], verbose: bool, snippet: bool, + workers: int, ) -> None: """Compare all datasets from two catalogs and print out a summary of their differences. Compare all the datasets from catalog in `PATH_A` with all the datasets in catalog `PATH_B`. The catalog paths link to the `data/` folder with all the datasets (it contains a `catalog.meta.json` file) + You can also use a path to a dataset. + Note that you can use the keyword "REMOTE" as the path, if you want to run a comparison with the remote catalog. This tool is useful as a quick way to see what has changed in the catalog and whether our updates don't have any unexpected side effects. 
@@ -320,14 +370,24 @@ def cli( path_to_ds_a = _load_catalog_datasets(path_a, channel, include, exclude) path_to_ds_b = _load_catalog_datasets(path_b, channel, include, exclude) - # only keep datasets in DAG + # only keep datasets in DAG, unless there's only one dataset selected by precise path dag_steps = {s.split("://")[1] for s in load_dag().keys()} - path_to_ds_a = {k: v for k, v in path_to_ds_a.items() if k in dag_steps} - path_to_ds_b = {k: v for k, v in path_to_ds_b.items() if k in dag_steps} + if len(path_to_ds_a) > 1: + path_to_ds_a = {k: v for k, v in path_to_ds_a.items() if k in dag_steps} + if len(path_to_ds_b) > 1: + path_to_ds_b = {k: v for k, v in path_to_ds_b.items() if k in dag_steps} + + if not path_to_ds_a: + console.print(f"[yellow]❓ No datasets found in {path_a}[/yellow]") + exit(0) + if not path_to_ds_b: + console.print(f"[yellow]❓ No datasets found in {path_b}[/yellow]") + exit(0) any_diff = False any_error = False + matched_datasets = [] for path in sorted(set(path_to_ds_a.keys()) | set(path_to_ds_b.keys())): ds_a = _match_dataset(path_to_ds_a, path) ds_b = _match_dataset(path_to_ds_b, path) @@ -337,27 +397,65 @@ def cli( # to improve performance. Source checksum should be enough continue - lines = [] + matched_datasets.append((ds_a, ds_b)) - def _append_and_print(x): - lines.append(x) - console.print(x) + if workers > 1: + futures = [] - try: - differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) - differ.summary() - except DatasetError as e: - # soft fail and continue with another dataset - _append_and_print(f"[bold red]⚠ Error: {e}[/bold red]") - continue - except Exception as e: - # soft fail and continue with another dataset - log.error(e, exc_info=True) - any_error = True - continue + with ThreadPoolExecutor(max_workers=workers) as executor: + for ds_a, ds_b in matched_datasets: + + def func(ds_a, ds_b): + lines = [] + differ = DatasetDiff( + ds_a, ds_b, cols=cols, print=lambda x: lines.append(x), verbose=verbose, snippet=snippet + ) + differ.summary() + return lines + + futures.append(executor.submit(func, ds_a, ds_b)) + + for future in futures: + try: + lines = future.result() + except DatasetError as e: + # soft fail and continue with another dataset + lines = [f"[bold red]⚠ Error: {e}[/bold red]"] + except Exception as e: + # soft fail and continue with another dataset + log.error(e, exc_info=True) + any_error = True + lines = [] + continue - if any("~" in line for line in lines if isinstance(line, str)): - any_diff = True + for line in lines: + console.print(line) + + if "~" in line: + any_diff = True + else: + for ds_a, ds_b in matched_datasets: + lines = [] + + def _append_and_print(x): + lines.append(x) + console.print(x) + + try: + differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) + differ.summary() + except DatasetError as e: + # soft fail and continue with another dataset + _append_and_print(f"[bold red]⚠ Error: {e}[/bold red]") + continue + except Exception as e: + # soft fail and continue with another dataset + log.error(e, exc_info=True) + any_error = True + continue + + if any("~" in line for line in lines if isinstance(line, str)): + any_diff = True console.print() if not path_to_ds_a and not path_to_ds_b: @@ -388,8 +486,8 @@ def _index_equals(table_a: pd.DataFrame, table_b: pd.DataFrame, sample: int = 10 index_a = table_a.index index_b = table_b.index else: - index_a = table_a.sample(sample, random_state=0).index - index_b = 
table_b.sample(sample, random_state=0).index + index_a = table_a.sample(sample, random_state=0, replace=True).index + index_b = table_b.sample(sample, random_state=0, replace=True).index return index_a.equals(index_b) @@ -413,23 +511,82 @@ def _dict_diff(dict_a: Dict[str, Any], dict_b: Dict[str, Any], tabs: int = 0, ** return "\t" * tabs + "".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() +def _df_to_str(df: pd.DataFrame, limit: int = 5) -> list[str]: + lines = [] + if len(df) > limit: + df_samp = df.sample(limit, random_state=0).sort_index() + else: + df_samp = df + + for line in df_samp.to_string(index=False).split("\n"): # type: ignore + lines.append(" " + line) + return lines + + def _data_diff( - table_a: Table, table_b: Table, col: str, dims: list[str], tabs: int, eq: Optional[pd.Series] = None + table_a: Table, + table_b: Table, + col: str, + dims: list[str], + eq_data: pd.Series, + eq_index: pd.Series, + new_index: pd.Series, + removed_index: pd.Series, + tabs: int = 0, ) -> str: """Return summary of data differences.""" - if eq is None: - eq = series_equals(table_a[col], table_b[col]) + # eq = eq_data & eq_index + n = (eq_index | new_index).sum() - lines = [ - f"- Changed values: {(~eq).sum()} / {len(eq)} ({(~eq).sum() / len(eq) * 100:.2f}%)", - ] + lines = [] + + cols = [d for d in dims if d is not None] + [col] + + # new values + if new_index.any(): + lines.append( + f"+ New values: {new_index.sum()} / {n} ({new_index.sum() / n * 100:.2f}%)", + ) + lines += _df_to_str(table_b.loc[new_index, cols]) + + # removed values + if removed_index.any(): + lines.append( + f"- Removed values: {removed_index.sum()} / {n} ({removed_index.sum() / n * 100:.2f}%)", + ) + lines += _df_to_str(table_a.loc[removed_index, cols]) + + # changed values + neq = ~eq_data & eq_index + if neq.any(): + lines.append( + f"~ Changed values: {neq.sum()} / {n} ({neq.sum() / n * 100:.2f}%)", + ) + samp_a = table_a.loc[neq, cols] + samp_b = table_b.loc[neq, cols] + both = samp_a.merge(samp_b, on=dims, suffixes=(" -", " +")) + lines += _df_to_str(both) + # add color + lines = ["[violet]" + line for line in lines] + + if not lines: + return "" + else: + # add tabs + return "\t" * tabs + "\n".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() + + """OLD CODE, PARTS OF IT COULD BE STILL USEFUL # changes in index for dim in dims: if dim is not None: diff_elements = table_a.loc[~eq, dim].dropna().astype(str).sort_values().unique().tolist() detail = f"{len(diff_elements)} affected" if len(diff_elements) > 5 else ", ".join(diff_elements) - lines.append(f"- {dim}: {detail}") + lines.append(f"- Dim `{dim}`: {detail}") + + lines.append( + f"- Changed values: {(~eq).sum()} / {len(eq)} ({(~eq).sum() / len(eq) * 100:.2f}%)", + ) # changes in values if ( @@ -452,15 +609,7 @@ def _data_diff( rel_diff = abs_diff / mean if not pd.isnull(mean) and mean != 0 else np.nan lines.append(f"- Avg. 
change: {abs_diff:.2f} ({rel_diff:.0%})") - - # add color - lines = ["[violet]" + line for line in lines] - - if not lines: - return "" - else: - # add tabs - return "\t" * tabs + "\n".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() + """ def _is_datetime(dtype: Any) -> bool: @@ -470,7 +619,7 @@ def _is_datetime(dtype: Any) -> bool: return False -def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Series]: +def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Series, pd.Series, pd.Series]: if not table_a.index.is_unique or not table_b.index.is_unique: raise DatasetError("Index must be unique.") @@ -488,11 +637,14 @@ def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Seri table_b["_x"] = 1 table_a, table_b = table_a.align(table_b, join="outer", copy=False) - eq_index = table_a["_x"].notnull() & table_b["_x"].notnull() + new_index = table_a["_x"].isnull() + removed_index = table_b["_x"].isnull() + + eq_index = ~(new_index | removed_index) table_a.drop(columns="_x", inplace=True) table_b.drop(columns="_x", inplace=True) - return cast(Table, table_a), cast(Table, table_b), eq_index + return cast(Table, table_a), cast(Table, table_b), eq_index, new_index, removed_index def _sort_index(df: Table) -> Table: @@ -554,10 +706,20 @@ def _table_metadata_dict(tab: Table) -> Dict[str, Any]: # for col in tab.columns: # d["columns"][col] = tab[col].metadata.to_dict() + # sort primary key + if "primary_key" in d: + d["primary_key"] = sorted(d["primary_key"]) + del d["dataset"] return d +def _column_metadata_dict(meta: VariableMeta) -> Dict[str, Any]: + d = meta.to_dict() + d.pop("processing_log", None) + return d + + def _dataset_metadata_dict(ds: Dataset) -> Dict[str, Any]: """Extract metadata from Dataset object, prune and and return it as a dictionary""" d = ds.metadata.to_dict() @@ -571,10 +733,21 @@ def _dataset_metadata_dict(ds: Dataset) -> Dict[str, Any]: def _local_catalog_datasets( - catalog_path: str, channels: Iterable[CHANNEL], include: Optional[str], exclude: Optional[str] + catalog_path: Union[str, Path], channels: Iterable[CHANNEL], include: Optional[str], exclude: Optional[str] ) -> Dict[str, Dataset]: """Return a mapping from dataset path to Dataset object of local catalog.""" - lc_a = LocalCatalog(catalog_path, channels=channels) + catalog_path = Path(catalog_path) + catalog_dir = catalog_path + + # it is possible to use subset of a data catalog + while not (catalog_dir / "catalog.meta.json").exists() and catalog_dir != catalog_dir.parent: + catalog_dir = catalog_dir.parent + + if catalog_dir != catalog_path: + assert include is None, "Include pattern is not supported for subset of a catalog" + include = str(catalog_path.relative_to(catalog_dir)) + + lc_a = LocalCatalog(catalog_dir, channels=channels) datasets = [] for chan in lc_a.channels: channel_datasets = list(lc_a.iter_datasets(chan, include=include)) @@ -585,7 +758,7 @@ def _local_catalog_datasets( datasets += channel_datasets # keep only relative path of dataset - mapping = {str(Path(ds.path).relative_to(catalog_path)): ds for ds in datasets} + mapping = {str(Path(ds.path).relative_to(catalog_dir)): ds for ds in datasets} if exclude: re_exclude = re.compile(exclude) @@ -619,10 +792,10 @@ def _remote_catalog_datasets(channels: Iterable[CHANNEL], include: str, exclude: ds_paths = frame["ds_paths"] if include: - ds_paths = ds_paths[ds_paths.str.contains(include)] + ds_paths = ds_paths[ds_paths.str.contains(include, regex=True)] if exclude: - ds_paths = 
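For reference, the sentinel-column alignment that `_align_tables` above relies on (add a `_x` marker column to both tables, outer-align on the index, and read off which side turned null) can be reproduced on two toy frames; this is a minimal sketch, not part of the patch:

```python
import pandas as pd

a = pd.DataFrame({"val": [1, 2]}, index=["UK", "US"])
b = pd.DataFrame({"val": [1, 3]}, index=["UK", "FR"])

# Mark every row of each frame, then outer-align both frames on their index.
a["_x"] = 1
b["_x"] = 1
a, b = a.align(b, join="outer")

new_index = a["_x"].isnull()  # True only for FR: present in b but missing in a
removed_index = b["_x"].isnull()  # True only for US: present in a but missing in b
eq_index = ~(new_index | removed_index)  # True only for UK: present in both
```

These are the masks that feed the "New values", "Removed values" and "Changed values" sections printed by `_data_diff`.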
ds_paths[~ds_paths.str.contains(exclude)] + ds_paths = ds_paths[~ds_paths.str.contains(exclude, regex=True)] ds_paths = set(ds_paths) diff --git a/etl/db.py b/etl/db.py index e2a7a9b9fa8..dcd13ba0e3c 100644 --- a/etl/db.py +++ b/etl/db.py @@ -2,7 +2,7 @@ import warnings from collections.abc import Generator from contextlib import contextmanager -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, cast from urllib.parse import quote import MySQLdb @@ -49,10 +49,13 @@ def get_session(**kwargs) -> Session: def get_engine(conf: Optional[Dict[str, Any]] = None) -> Engine: cf: Any = dict_to_object(conf) if conf else config - return create_engine( - f"mysql://{cf.DB_USER}:{quote(cf.DB_PASS)}@{cf.DB_HOST}:{cf.DB_PORT}/{cf.DB_NAME}", - pool_size=30, # Increase the pool size to allow higher GRAPHER_WORKERS - max_overflow=30, # Increase the max overflow limit to allow higher GRAPHER_WORKERS + return cast( + Engine, + create_engine( + f"mysql://{cf.DB_USER}:{quote(cf.DB_PASS)}@{cf.DB_HOST}:{cf.DB_PORT}/{cf.DB_NAME}", + pool_size=30, # Increase the pool size to allow higher GRAPHER_WORKERS + max_overflow=30, # Increase the max overflow limit to allow higher GRAPHER_WORKERS + ), ) @@ -459,3 +462,14 @@ def get_info_for_etl_datasets(db_conn: Optional[MySQLdb.Connection] = None) -> p df.loc[df["is_private"], "step"] = df[df["is_private"]]["step"].str.replace("data://", "data-private://") return df + + +def read_sql(sql: str, engine: Optional[Engine] = None, *args, **kwargs) -> pd.DataFrame: + """Wrapper around pd.read_sql that creates a connection and closes it after reading the data. + This adds overhead, so if you need performance, reuse the same connection and cursor. + """ + engine = engine or get_engine() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + with engine.connect() as con: + return pd.read_sql(sql, con.connection, *args, **kwargs) diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py index a6bc869bc35..064ca1401ad 100644 --- a/etl/grapher_helpers.py +++ b/etl/grapher_helpers.py @@ -13,7 +13,7 @@ from owid import catalog from owid.catalog.utils import underscore -from etl.db import get_connection, get_engine +from etl.db import get_connection, read_sql from etl.db_utils import DBUtils from etl.files import checksum_str @@ -303,7 +303,7 @@ def long_to_wide_tables( def _get_entities_from_db(countries: Set[str], by: Literal["name", "code"]) -> Dict[str, int]: q = f"select id as entity_id, {by} from entities where {by} in %(names)s" - df = pd.read_sql(q, get_engine(), params={"names": list(countries)}) + df = read_sql(q, params={"names": list(countries)}) return cast(Dict[str, int], df.set_index(by).entity_id.to_dict()) @@ -498,6 +498,8 @@ def _adapt_table_for_grapher( assert {"year", country_col} <= set(table.columns), f"Table must have columns {country_col} and year." assert "entity_id" not in table.columns, "Table must not have column entity_id." + table[country_col] = table[country_col].astype(str) + # Grapher needs a column entity id, that is constructed based on the unique entity names in the database. table["entity_id"] = country_to_entity_id(table[country_col], create_entities=True) table = table.drop(columns=[country_col]).rename(columns={year_col: "year"}) diff --git a/etl/grapher_import.py b/etl/grapher_import.py index 97533efc383..346b42a5043 100644 --- a/etl/grapher_import.py +++ b/etl/grapher_import.py @@ -213,10 +213,12 @@ def upsert_table( "Tables to be upserted must have no null values. 
Instead they" f" have:\n{table.loc[table.iloc[:, 0].isnull()]}" ) table = table.reorder_levels(["year", "entity_id"]) - assert table.index.dtypes[0] in gh.INT_TYPES, f"year must be of an integer type but was: {table.index.dtypes[0]}" assert ( - table.index.dtypes[1] in gh.INT_TYPES - ), f"entity_id must be of an integer type but was: {table.index.dtypes[1]}" + table.index.dtypes.iloc[0] in gh.INT_TYPES + ), f"year must be of an integer type but was: {table.index.dtypes.iloc[0]}" + assert ( + table.index.dtypes.iloc[1] in gh.INT_TYPES + ), f"entity_id must be of an integer type but was: {table.index.dtypes.iloc[1]}" utils.validate_underscore(table.metadata.short_name, "Table's short_name") utils.validate_underscore(table.columns[0], "Variable's name") diff --git a/etl/variable_mapping_translate.py b/etl/variable_mapping_translate.py index e7d49632225..69e9a187be2 100644 --- a/etl/variable_mapping_translate.py +++ b/etl/variable_mapping_translate.py @@ -9,6 +9,8 @@ from sqlalchemy import create_engine from sqlalchemy.engine.base import Engine +from etl.db import read_sql + log = structlog.get_logger() @@ -191,8 +193,7 @@ def _run_query_mapping_to_df(sql: Engine, variable_ids: Tuple[str, ...]) -> pd.D left join datasets on variables.datasetId=datasets.id where variables.id in %(variable_ids)s; """ - df: pd.DataFrame = pd.read_sql_query(query, sql, params={"variable_ids": variable_ids}) - return df + return read_sql(query, sql, params={"variable_ids": variable_ids}) def _build_dfs(sql: Engine, mapping: Dict[str, str]) -> Tuple[pd.DataFrame, pd.DataFrame]: diff --git a/lib/catalog/owid/catalog/variables.py b/lib/catalog/owid/catalog/variables.py index 565a9dbae85..addc4d02d01 100644 --- a/lib/catalog/owid/catalog/variables.py +++ b/lib/catalog/owid/catalog/variables.py @@ -367,31 +367,35 @@ def _get_metadata_value_from_variables_if_all_identical( def get_unique_sources_from_variables(variables: List[Variable]) -> List[Source]: # Make a list of all sources of all variables. - sources = sum([variable.metadata.sources for variable in variables], []) - - return pd.unique(sources).tolist() + sources = [] + for variable in variables: + sources += [s for s in variable.metadata.sources if s not in sources] + return sources def get_unique_origins_from_variables(variables: List[Variable]) -> List[Origin]: # Make a list of all origins of all variables. - origins = sum([variable.metadata.origins for variable in variables], []) - - # Get unique array of tuples of origin fields (respecting the order). - return pd.unique(origins).tolist() + origins = [] + for variable in variables: + # Get unique array of tuples of origin fields (respecting the order). + origins += [o for o in variable.metadata.origins if o not in origins] + return origins def get_unique_licenses_from_variables(variables: List[Variable]) -> List[License]: # Make a list of all licenses of all variables. - licenses = sum([variable.metadata.licenses for variable in variables], []) - - return pd.unique(licenses).tolist() + licenses = [] + for variable in variables: + licenses += [license for license in variable.metadata.licenses if license not in licenses] + return licenses def get_unique_description_key_points_from_variables(variables: List[Variable]) -> List[str]: # Make a list of all description key points of all variables. 
- description_key_points = sum([variable.metadata.description_key for variable in variables], []) - - return pd.unique(description_key_points).tolist() + description_key_points = [] + for variable in variables: + description_key_points += [k for k in variable.metadata.description_key if k not in description_key_points] + return description_key_points def combine_variables_processing_logs(variables: List[Variable]) -> ProcessingLog: diff --git a/lib/repack/owid/repack/__init__.py b/lib/repack/owid/repack/__init__.py index 32fbe1c63b6..33017514990 100644 --- a/lib/repack/owid/repack/__init__.py +++ b/lib/repack/owid/repack/__init__.py @@ -1,3 +1,4 @@ +import datetime as dt from typing import Any, Dict, List, Optional, cast import numpy as np @@ -65,7 +66,7 @@ def repack_series(s: pd.Series) -> pd.Series: for strategy in [to_int, to_float, to_category]: try: return strategy(s) - except (ValueError, TypeError): + except (ValueError, TypeError, OverflowError): continue return s @@ -126,7 +127,7 @@ def to_float(s: pd.Series) -> pd.Series: def to_category(s: pd.Series) -> pd.Series: types = set(s.dropna().apply(type).unique()) - if types.difference({str, type(None)}): + if types.difference({str, np.str_, dt.datetime, dt.date, type(None)}): raise ValueError() return s.astype("category") diff --git a/lib/repack/tests/test_repack.py b/lib/repack/tests/test_repack.py index 4a666a29ce7..e9596d8b05e 100644 --- a/lib/repack/tests/test_repack.py +++ b/lib/repack/tests/test_repack.py @@ -1,3 +1,4 @@ +import datetime as dt from typing import Any import numpy as np @@ -226,3 +227,21 @@ def test_series_eq(): a = pd.Series([1, np.nan], dtype="float64") b = pd.Series([1, np.nan], dtype="float64") assert repack.series_eq(a, b, cast=float) + + +def test_repack_object_np_str(): + s = pd.Series(["a", np.str_("b")], dtype=object) + v = repack.repack_series(s) + assert v.dtype.name == "category" + + +def test_repack_with_inf(): + s = pd.Series([0, np.inf], dtype=object) + v = repack.repack_series(s) + assert v.dtype.name == "float32" + + +def test_repack_with_datetime(): + s = pd.Series([dt.datetime.today(), dt.date.today()], dtype=object) + v = repack.repack_series(s) + assert v.dtype.name == "category" diff --git a/tests/data_helpers/test_geo.py b/tests/data_helpers/test_geo.py index adcf52d5a4a..b763f908018 100644 --- a/tests/data_helpers/test_geo.py +++ b/tests/data_helpers/test_geo.py @@ -652,9 +652,7 @@ def test_replace_region_with_one_mandatory_country_having_nan(self): df_in = self.df_in.copy() # Add NaN value for Country 2 - df_in = df_in.append( - {"country": "Country 2", "year": 2021, "var_01": np.nan, "var_02": np.nan}, ignore_index=True - ) + df_in.loc[len(df_in)] = {"country": "Country 2", "year": 2021, "var_01": np.nan, "var_02": np.nan} df = geo.add_region_aggregates( df=df_in, diff --git a/tests/test_datadiff.py b/tests/test_datadiff.py index d5f370b0083..ffeac301229 100644 --- a/tests/test_datadiff.py +++ b/tests/test_datadiff.py @@ -1,10 +1,10 @@ import pandas as pd from owid.catalog import Dataset, DatasetMeta, Table -from etl.datadiff import DatasetDiff, _data_diff +from etl.datadiff import DatasetDiff -def test_DatasetDiff_summary(tmp_path): +def _create_datasets(tmp_path): (tmp_path / "catalog_a").mkdir() (tmp_path / "catalog_b").mkdir() @@ -16,6 +16,12 @@ def test_DatasetDiff_summary(tmp_path): ds_b = Dataset.create_empty(tmp_path / "catalog_b" / "ds", ds_meta_b) ds_b.metadata.channel = "garden" # type: ignore + return ds_a, ds_b + + +def test_DatasetDiff_summary(tmp_path): + ds_a, ds_b = 
_create_datasets(tmp_path) + tab_a = Table(pd.DataFrame({"a": [1, 2]}), short_name="tab") tab_a.metadata.description = "tab" @@ -32,21 +38,29 @@ def test_DatasetDiff_summary(tmp_path): assert out == [ "[white]= Dataset [b]garden/n/v/ds[/b]", "\t[yellow]~ Table [b]tab[/b] (changed [u]metadata[/u])", - "\t\t[yellow]~ Column [b]a[/b] (changed [u]data & metadata[/u])", + "\t\t[yellow]~ Column [b]a[/b] (changed [u]metadata[/u], changed [u]data[/u])", "\t\t[green]+ Column [b]b[/b]", ] -def test_data_diff(): - table_a = Table({"country": ["UK", "US"], "a": [1, 2]}) - table_b = Table({"country": ["UK", "US"], "a": [1, 3]}) - out = _data_diff(table_a, table_b, col="a", dims=["country"], tabs=0) - print(out) - assert ( - out - == """ -[violet]- Changed values: 1 / 2 (50.00%) -[violet]- country: US -[violet]- Avg. change: 1.00 (40%) - """.strip() - ) +def test_new_data(tmp_path): + ds_a, ds_b = _create_datasets(tmp_path) + + tab_a = Table({"country": ["UK", "US"], "a": [1, 3]}, short_name="tab") + tab_b = Table({"country": ["UK", "US", "FR"], "a": [1, 2, 3]}, short_name="tab") + + ds_a.add(tab_a) + ds_b.add(tab_b) + + out = [] + differ = DatasetDiff(ds_a, ds_b, print=lambda x: out.append(x), verbose=True) + differ.summary() + + assert out == [ + "[white]= Dataset [b]garden/n/v/ds[/b]", + "\t[white]= Table [b]tab[/b]", + "\t\t[yellow]~ Dim [b]country[/b]", + "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country\n\t\t\t\t[violet] FR", + "\t\t[yellow]~ Column [b]a[/b] (new [u]data[/u], changed [u]data[/u])", + "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a\n\t\t\t\t[violet] FR 3\n\t\t\t\t[violet]~ Changed values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a - a +\n\t\t\t\t[violet] US 3.0 2", + ] From 0c2c2a9573572cb2f1e7ec10bb2dd8f34ca1c0b3 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 10 Apr 2024 14:13:57 +0200 Subject: [PATCH 06/61] :tada: owidbot posts results of `etl diff` to pull requests (#2498) * :tada: post datadiff to PR with owidbot --- apps/owidbot/__init__.py | 0 apps/owidbot/etldiff.py | 153 ++++++++++++++++++++++++++++++++++++ etl/config.py | 2 + etl/datadiff.py | 8 +- poetry.lock | 163 ++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 6 files changed, 324 insertions(+), 3 deletions(-) create mode 100644 apps/owidbot/__init__.py create mode 100644 apps/owidbot/etldiff.py diff --git a/apps/owidbot/__init__.py b/apps/owidbot/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py new file mode 100644 index 00000000000..e72652b5fce --- /dev/null +++ b/apps/owidbot/etldiff.py @@ -0,0 +1,153 @@ +import datetime as dt +import subprocess +from typing import Tuple + +import click +import structlog +from github import Auth, Github +from rich import print +from rich.ansi import AnsiDecoder +from rich_click.rich_command import RichCommand + +from etl import config +from etl.paths import BASE_DIR + +log = structlog.get_logger() + + +EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet" + + +@click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__) +@click.option( + "--branch", + type=str, +) +@click.option( + "--dry-run/--no-dry-run", + default=False, + type=bool, + help="Print to console, do not post to Github.", +) +def cli( + branch: str, + dry_run: bool, +) -> None: + """Post result of `etl diff` to Github PR. 
+ + Example: + + ``` + $ python apps/owidbot/etldiff.py --branch my-branch + ``` + """ + lines = call_etl_diff() + diff, result = format_etl_diff(lines) + + body = f""" +
+<details>
+<summary>etl diff: {result}</summary>
+
+```diff
+{diff}
+```
+
+Automatically updated datasets matching _{EXCLUDE_DATASETS}_ are not included
+
+</details>
+ +_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_ + """.strip() + + if dry_run: + print(body) + else: + post_comment_to_pr(branch, body) + + +def post_comment_to_pr(branch_name: str, body: str) -> None: + assert config.OWIDBOT_ACCESS_TOKEN + auth = Auth.Token(config.OWIDBOT_ACCESS_TOKEN) + g = Github(auth=auth) + + repo = g.get_repo("owid/etl") + + # Find pull requests for the branch (assuming you're looking for open PRs) + pulls = repo.get_pulls(state="open", sort="created", head=f"{repo.owner.login}:{branch_name}") + pulls = list(pulls) + + if len(pulls) == 0: + raise AssertionError(f"No open PR found for branch {branch_name}") + elif len(pulls) > 1: + raise AssertionError(f"More than one open PR found for branch {branch_name}") + + pr = pulls[0] + + comments = pr.get_issue_comments() + + owidbot_comments = [comment for comment in comments if comment.user.login == "owidbot"] + + if len(owidbot_comments) == 0: + pr.create_issue_comment(body=body) + elif len(owidbot_comments) == 1: + owidbot_comment = owidbot_comments[0] + owidbot_comment.edit(body=body) + else: + raise AssertionError("More than one owidbot comment found.") + + +def format_etl_diff(lines: list[str]) -> Tuple[str, str]: + new_lines = [] + result = "" + for line in lines: + # extract result + if line and line[0] in ("✅", "❌", "⚠️", "❓"): + result = line + continue + + # skip some lines + if "this may get slow" in line or "comparison with compare" in line: + continue + + if line.strip().startswith("-"): + line = "-" + line[1:] + if line.strip().startswith("+"): + line = "+" + line[1:] + + new_lines.append(line) + + diff = "\n".join(new_lines) + return diff, result + + +def call_etl_diff() -> list[str]: + cmd = [ + "poetry", + "run", + "etl", + "diff", + "REMOTE", + "data/", + "--include", + "garden", + "--exclude", + EXCLUDE_DATASETS, + "--verbose", + "--workers", + "3", + ] + + result = subprocess.Popen(cmd, cwd=BASE_DIR, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = result.communicate() + + stdout = stdout.decode() + stderr = stderr.decode() + + if stderr: + raise Exception(f"Error: {stderr}") + + return [str(line) for line in AnsiDecoder().decode(stdout)] + + +if __name__ == "__main__": + cli() diff --git a/etl/config.py b/etl/config.py index 528ba5adc15..5005ea5cef7 100644 --- a/etl/config.py +++ b/etl/config.py @@ -150,6 +150,8 @@ def variable_metadata_url(variable_id): OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) +OWIDBOT_ACCESS_TOKEN = env.get("OWIDBOT_ACCESS_TOKEN", None) + def enable_bugsnag() -> None: if BUGSNAG_API_KEY: diff --git a/etl/datadiff.py b/etl/datadiff.py index f45008a80df..b7eaa5e97f9 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -1,6 +1,7 @@ import difflib import os import re +import traceback from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast @@ -365,7 +366,7 @@ def cli( $ etl diff other-data/ data/ --include maddison ``` """ - console = Console(tab_size=2) + console = Console(tab_size=2, soft_wrap=True) path_to_ds_a = _load_catalog_datasets(path_a, channel, include, exclude) path_to_ds_b = _load_catalog_datasets(path_b, channel, include, exclude) @@ -423,7 +424,7 @@ def func(ds_a, ds_b): lines = [f"[bold red]⚠ Error: {e}[/bold red]"] except Exception as e: # soft fail and continue with another dataset - log.error(e, exc_info=True) + log.error("\n".join(traceback.format_exception(type(e), e, e.__traceback__))) any_error = True lines = [] 
continue @@ -757,6 +758,9 @@ def _local_catalog_datasets( datasets += channel_datasets + # only compare public datasets + datasets = [ds for ds in datasets if ds.is_public] + # keep only relative path of dataset mapping = {str(Path(ds.path).relative_to(catalog_dir)): ds for ds in datasets} diff --git a/poetry.lock b/poetry.lock index b25db6638b6..7caf0a94fcf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1262,6 +1262,23 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "deprecated" +version = "1.2.14" +description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, + {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, +] + +[package.dependencies] +wrapt = ">=1.10,<2" + +[package.extras] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] + [[package]] name = "distro" version = "1.8.0" @@ -4458,6 +4475,25 @@ PyYAML = ">=3.0" fsspec = ["appdirs (>=1.4.3)", "fsspec (>=2021.07.0)", "funcy (>=1.14)", "tqdm (>=4.0.0)"] tests = ["black (==23.3.0)", "flake8", "flake8-docstrings", "funcy (>=1.14)", "importlib-resources (<6)", "pyinstaller", "pytest (>=4.6.0)", "pytest-mock", "timeout-decorator"] +[[package]] +name = "pygithub" +version = "2.3.0" +description = "Use the full Github API v3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyGithub-2.3.0-py3-none-any.whl", hash = "sha256:65b499728be3ce7b0cd2cd760da3b32f0f4d7bc55e5e0677617f90f6564e793e"}, + {file = "PyGithub-2.3.0.tar.gz", hash = "sha256:0148d7347a1cdeed99af905077010aef81a4dad988b0ba51d4108bf66b443f7e"}, +] + +[package.dependencies] +Deprecated = "*" +pyjwt = {version = ">=2.4.0", extras = ["crypto"]} +pynacl = ">=1.4.0" +requests = ">=2.14.0" +typing-extensions = ">=4.0.0" +urllib3 = ">=1.26.0" + [[package]] name = "pygments" version = "2.16.1" @@ -4483,6 +4519,26 @@ files = [ {file = "pyhumps-3.8.0.tar.gz", hash = "sha256:498026258f7ee1a8e447c2e28526c0bea9407f9a59c03260aee4bd6c04d681a3"}, ] +[[package]] +name = "pyjwt" +version = "2.8.0" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, + {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, +] + +[package.dependencies] +cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""} + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + [[package]] name = "pymdown-extensions" version = "10.3" @@ -4501,6 +4557,32 @@ pyyaml = "*" [package.extras] extra = ["pygments (>=2.12)"] +[[package]] +name = "pynacl" +version = "1.5.0" +description = "Python binding to the Networking and Cryptography (NaCl) library" +optional = false +python-versions = ">=3.6" 
+files = [ + {file = "PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a36d4a9dda1f19ce6e03c9a784a2921a4b726b02e1c736600ca9c22029474394"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0c84947a22519e013607c9be43706dd42513f9e6ae5d39d3613ca1e142fba44d"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858"}, + {file = "PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b"}, + {file = "PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:61f642bf2378713e2c2e1de73444a3778e5f0a38be6fee0fe532fe30060282ff"}, + {file = "PyNaCl-1.5.0-cp36-abi3-win32.whl", hash = "sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543"}, + {file = "PyNaCl-1.5.0-cp36-abi3-win_amd64.whl", hash = "sha256:20f42270d27e1b6a29f54032090b972d97f0a1b0948cc52392041ef7831fee93"}, + {file = "PyNaCl-1.5.0.tar.gz", hash = "sha256:8ac7448f09ab85811607bdd21ec2464495ac8b7c66d146bf545b0f08fb9220ba"}, +] + +[package.dependencies] +cffi = ">=1.4.1" + +[package.extras] +docs = ["sphinx (>=1.6.5)", "sphinx-rtd-theme"] +tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] + [[package]] name = "pyopenssl" version = "23.2.0" @@ -7388,6 +7470,85 @@ cachetools = "*" pandas = "*" requests = "*" +[[package]] +name = "wrapt" +version = "1.16.0" +description = "Module for decorators, wrappers and monkey patching." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = 
"wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = 
"wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, +] + [[package]] name = "wsproto" version = "1.2.0" @@ -7445,4 +7606,4 @@ test = ["pytest", "pytest-cov"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.12" -content-hash = "19fe7f0b8f32bcf844488bfe75e1f6302a7e4278bb1b461f45bec30efa2efd6a" +content-hash = "6604150041608aef717982c0ecfd530d14a1f5dec4c0d2bb0582124ca90fce20" diff --git a/pyproject.toml b/pyproject.toml index 5431762f90f..6e67f75dced 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ cdsapi = "^0.6.1" rioxarray = "^0.15.1" tiktoken = "^0.6.0" html2text = "^2020.1.16" +pygithub = "^2.3.0" [tool.poetry.group.api.dependencies] fastapi = "^0.109.0" From 11bf83bd19e14fcba3b153af58a63bc6b1b423c8 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Wed, 10 Apr 2024 17:20:25 +0200 Subject: [PATCH 07/61] =?UTF-8?q?=F0=9F=90=9B=20set=20first=20year=20to=20?= =?UTF-8?q?baseline=20of=20=E2=80=931=20for=20all=20benchmarks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set the first year's performance to the baseline of –1 for each benchmark. This is to preserve a baseline for –1 for all benchmarks, even when a second, better performance is recorded in a later year. --- .../garden/artificial_intelligence/2024-04-02/dynabench.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py index 208d80198a5..c0d48e24e02 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py @@ -24,6 +24,12 @@ def run(dest_dir: str) -> None: # Selecting the best performance for each benchmark per year tb = tb.groupby(["benchmark", "year"])["performance"].max().reset_index().copy_metadata(from_table=tb) + # Set the first year's performance to the baseline of –1 for each benchmark. + # This is to preserve a baseline for –1 for all benchmarks, + # even when a second, better performance is recorded in a later year. + tb = tb.sort_values(by=["benchmark", "year"]) + tb.loc[tb.groupby("benchmark").head(1).index, "performance"] = -1 + mapping = { "MNIST": "Handwriting recognition", "Switchboard": "Speech recognition", From 08350d2326f8861aa5a4108abcc8ff3230c6a123 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Wed, 10 Apr 2024 17:31:48 +0200 Subject: [PATCH 08/61] =?UTF-8?q?=E2=9C=A8=20dynabench:=20disable=20downlo?= =?UTF-8?q?ad=20button=20for=20chart?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given the transformation we apply in 11bf83bd19e14fcba3b153af58a63bc6b1b423c8, Max and I think it's wiser to disable the download feature for this chart, as the downloadable data doesn't reflect the provider's data. 
--- .../garden/artificial_intelligence/2024-04-02/dynabench.meta.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml index 33ca7d1a1ad..89456c2164a 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml +++ b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml @@ -27,6 +27,7 @@ definitions: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 + non_redistributable: true tables: dynabench: From 2509197f5695b5c674940cff3bdf28f894c9048a Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 10 Apr 2024 21:05:48 +0200 Subject: [PATCH 09/61] =?UTF-8?q?=F0=9F=90=9D=20Update=20pandas=20to=202.2?= =?UTF-8?q?=20(#2468)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🐝 Update pandas to 2.2 --- etl/config.py | 7 +- .../2023-06-08/yougov_end_of_humanity.py | 1 + .../2023-06-14/ai_national_strategy.py | 4 +- .../2023-06-26/ai_wrp_2021.py | 4 +- .../data/garden/democracy/2024-03-07/bmr.py | 2 +- .../education/2023-07-17/education_lee_lee.py | 18 +- .../garden/education/2023-07-17/shared.py | 8 +- .../2024-02-26/gdp_and_co2_decoupling.py | 1 + .../latest/xm_karlinsky_kobak.py | 4 +- .../data/garden/faostat/2022-05-17/shared.py | 4 +- .../data/garden/faostat/2023-02-22/shared.py | 4 +- .../data/garden/faostat/2023-06-12/shared.py | 4 +- .../faostat/2024-03-14/faostat_metadata.py | 4 +- .../data/garden/faostat/2024-03-14/shared.py | 11 +- .../2023-09-18/colonial_dates_dataset.py | 3 + .../garden/homicide/2023-01-03/who_mort_db.py | 4 +- .../2023-06-14/prevalence_dalys_world.py | 2 +- .../plastic_waste/2023-09-26/geyer_2017.py | 5 +- .../data/garden/tourism/2023-05-05/unwto.py | 6 +- .../2023-11-27/unhlm_commitments.py | 2 +- .../urban_agglomerations_definition_count.py | 4 +- .../2024-01-09/nuclear_weapons_inventories.py | 2 +- .../status_of_world_nuclear_forces.py | 3 +- .../war/2024-01-25/nuclear_weapons_tests.py | 2 +- .../data/garden/wb/2021-07-01/wb_income.ipynb | 205 ++++++++++-- .../who/2023-04-03/flu_vaccine_policy.py | 8 +- etl/steps/data/garden/who/2024-01-03/gho.py | 7 +- .../garden/wvs/2023-06-25/longitudinal_wvs.py | 3 + .../surface_temperature_anomalies.py | 1 - .../2023-12-20/surface_temperature_monthly.py | 1 - .../2023-06-07/monmouth_poll.py | 2 +- .../2023-06-08/yougov_end_of_humanity.py | 6 +- .../2023-06-08/yougov_jobs.py | 2 +- .../climate/2024-01-28/global_sea_level.py | 2 +- .../health/2023-05-04/global_wellbeing.py | 8 +- .../oecd/2023-05-18/co2_air_transport.py | 2 +- .../data/meadow/tourism/2023-05-05/unwto.py | 5 +- .../tourism/2023-05-09/unwto_environment.py | 1 + lib/catalog/owid/catalog/tables.py | 119 ++++--- lib/catalog/poetry.lock | 117 ++++--- lib/catalog/pyproject.toml | 2 +- lib/catalog/tests/test_tables.py | 14 + lib/datautils/owid/datautils/dataframes.py | 4 +- lib/datautils/poetry.lock | 236 ++++++++------ lib/datautils/pyproject.toml | 4 +- lib/datautils/tests/test_dataframes.py | 12 +- lib/repack/poetry.lock | 153 +++++++-- lib/repack/pyproject.toml | 4 +- lib/walden/poetry.lock | 48 +-- poetry.lock | 307 ++++++++++-------- pyproject.toml | 3 +- 51 files changed, 906 insertions(+), 479 deletions(-) diff --git a/etl/config.py b/etl/config.py index 5005ea5cef7..d73719b985c 100644 --- a/etl/config.py +++ b/etl/config.py @@ -12,6 +12,7 @@ from os import environ as 
env import bugsnag +import pandas as pd from dotenv import load_dotenv from etl.paths import BASE_DIR @@ -30,6 +31,10 @@ def load_env(): load_env() + + +pd.set_option("future.no_silent_downcasting", True) + # When DEBUG is on # - run steps in the same process (speeding up ETL) DEBUG = env.get("DEBUG") in ("True", "true", "1") @@ -131,7 +136,7 @@ def variable_metadata_url(variable_id): MAX_VIRTUAL_MEMORY_LINUX = 32 * 2**30 # 32 GB # increment this to force a full rebuild of all datasets -ETL_EPOCH = 4 +ETL_EPOCH = 5 # any garden or grapher dataset after this date will have strict mode enabled STRICT_AFTER = "2023-06-25" diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py b/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py index ac765801239..8e4756f09ec 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py @@ -61,6 +61,7 @@ def run(dest_dir: str) -> None: # Transform the 'melted_df_all_age_groups' dataframe into a pivot table with 'options' as index and # each unique value in 'melted_columns' as a column. Store the pivot table in 'pivot_df_all_age_groups'. + melted_df_all_age_groups = melted_df_all_age_groups.astype({"melted_columns": "category"}) pivot_df_all_age_groups = melted_df_all_age_groups.pivot_table( index=["options"], columns="melted_columns", values="value" ) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index 9fa62b488c5..ad708f5e3cb 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -58,10 +58,10 @@ def run(dest_dir: str) -> None: # Check if any year for the current country is not NaN if not group["released_national_strategy_on_ai"].isna().all(): # Forward fill NaN values after "Released" - group["released_national_strategy_on_ai"].fillna(method="ffill", inplace=True) + group["released_national_strategy_on_ai"] = group["released_national_strategy_on_ai"].fillna(method="ffill") # Fill remaining NaN values with "Not Released" - group["released_national_strategy_on_ai"].fillna("Not released", inplace=True) + group["released_national_strategy_on_ai"] = group["released_national_strategy_on_ai"].fillna("Not released") df_merged.loc[group.index] = group df_merged.drop("released", axis=1, inplace=True) tb = Table(df_merged, short_name=paths.short_name, underscore=True) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py index bfe7cd4e1c7..68e158aa09f 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py @@ -154,7 +154,7 @@ def calculate_percentage(df, column, valid_responses_dict, column_to_split_by): df_filtered = df[[column_to_split_by, "year", column]][valid_responses].reset_index(drop=True) # Group by country and year - grouped = df_filtered.groupby([column_to_split_by, "year"]) + grouped = df_filtered.groupby([column_to_split_by, "year"], observed=True) # Count valid responses counts = grouped[column].value_counts().reset_index(name="count") @@ -343,7 +343,7 @@ def pivot_by_category(df, question): # 
Iterate over each pivot column for pivot_col in cols_pivot: # Pivot the dataframe for the current pivot column - pivoted_df = pd.pivot_table(df, values=question, index=["country", "year"], columns=pivot_col) + pivoted_df = pd.pivot_table(df, values=question, index=["country", "year"], columns=pivot_col, observed=True) # Append the pivot table to the list pivot_tables.append(pivoted_df) diff --git a/etl/steps/data/garden/democracy/2024-03-07/bmr.py b/etl/steps/data/garden/democracy/2024-03-07/bmr.py index 578cd9c96f8..74aa650c352 100644 --- a/etl/steps/data/garden/democracy/2024-03-07/bmr.py +++ b/etl/steps/data/garden/democracy/2024-03-07/bmr.py @@ -274,7 +274,7 @@ def add_imputes(tb: Table) -> Table: tb = concat(tb_imputed + [tb], ignore_index=True) # Set to False by default (for non-imputed countries) - tb["regime_imputed"] = tb["regime_imputed"].fillna(False) + tb["regime_imputed"] = tb["regime_imputed"].fillna(False).astype(bool) # Re-order columns cols = [ diff --git a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py index 01dd3c7918a..4c73aaa3796 100644 --- a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py +++ b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py @@ -74,13 +74,17 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) # Replace age group values with descriptive labels - tb["age_group"] = tb["age_group"].replace( - { - "15.0-64.0": "Youth and Adults (15-64 years)", - "15.0-24.0": "Youth (15-24 years)", - "25.0-64.0": "Adults (25-64 years)", - "not specified": "Age not specified", - } + tb["age_group"] = ( + tb["age_group"] + .astype(str) + .replace( + { + "15.0-64.0": "Youth and Adults (15-64 years)", + "15.0-24.0": "Youth (15-24 years)", + "25.0-64.0": "Adults (25-64 years)", + "not specified": "Age not specified", + } + ) ) # Prepare enrollment and attainment data diff --git a/etl/steps/data/garden/education/2023-07-17/shared.py b/etl/steps/data/garden/education/2023-07-17/shared.py index 8db6ff57962..ce0998d945d 100644 --- a/etl/steps/data/garden/education/2023-07-17/shared.py +++ b/etl/steps/data/garden/education/2023-07-17/shared.py @@ -135,7 +135,11 @@ def add_region_aggregates_education( def weighted_mean(x, w): values = np.ma.masked_invalid(x.astype("float64")) weights = np.ma.masked_invalid(w.astype("float64")) - return np.ma.average(values, weights=weights) + out = np.ma.average(values, weights=weights) + if np.ma.is_masked(out): + return np.nan + else: + return out # Create a closure to define variable_agg with specific weights def make_weighted_mean(weights): @@ -149,7 +153,7 @@ def variable_agg(x): else: variable_agg = aggregations[variable] - aggs[variable] = variable_agg + aggs[variable] = variable_agg # type: ignore df_region = groupby_agg( df=df_countries, diff --git a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py index 98fd7cb5b14..9f9142fdd66 100644 --- a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py +++ b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py @@ -9,6 +9,7 @@ TODO: Include link to the updated static chart once it is created. 
""" + from structlog import get_logger from etl.helpers import PathFinder, create_dataset diff --git a/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py b/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py index 380500d9c2f..4a940cfef89 100644 --- a/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py +++ b/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py @@ -262,7 +262,9 @@ def add_uk_by_age(df: pd.DataFrame): time_units = df_uk["time_unit"].unique() assert len(time_units) == 1, "There are multiple time units for UK Nations" # Estimate metrics - df_uk = df_uk.groupby(["year", "time", "age"], as_index=False).sum(min_count=3) + df_uk = ( + df_uk.drop(columns=["entity", "time_unit"]).groupby(["year", "time", "age"], as_index=False).sum(min_count=3) + ) # Reassign entity name and time unit df_uk["entity"] = "United Kingdom" df_uk["time_unit"] = time_units[0] diff --git a/etl/steps/data/garden/faostat/2022-05-17/shared.py b/etl/steps/data/garden/faostat/2022-05-17/shared.py index 2422e17b7c4..d7fb893e2cc 100644 --- a/etl/steps/data/garden/faostat/2022-05-17/shared.py +++ b/etl/steps/data/garden/faostat/2022-05-17/shared.py @@ -1366,7 +1366,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1417,7 +1417,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2023-02-22/shared.py b/etl/steps/data/garden/faostat/2023-02-22/shared.py index 6f18800d737..120f7f476df 100644 --- a/etl/steps/data/garden/faostat/2023-02-22/shared.py +++ b/etl/steps/data/garden/faostat/2023-02-22/shared.py @@ -1304,7 +1304,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1355,7 +1355,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. 
element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2023-06-12/shared.py b/etl/steps/data/garden/faostat/2023-06-12/shared.py index 1953069445b..9c6774e9f77 100644 --- a/etl/steps/data/garden/faostat/2023-06-12/shared.py +++ b/etl/steps/data/garden/faostat/2023-06-12/shared.py @@ -1314,7 +1314,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1365,7 +1365,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 17514fea34c..03c0c45e48b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -498,7 +498,9 @@ def create_elements_table_for_domain(table: Table, metadata: Dataset, dataset_sh .sort_values(["fao_unit_short_name"]) .reset_index(drop=True) ) - elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna(elements_from_data["fao_unit_short_name"]) + elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna( + elements_from_data["fao_unit_short_name"].astype(object) + ) # Sanity checks: diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index ec239660fea..9377889c115 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -974,8 +974,11 @@ def remove_overlapping_data_between_historical_regions_and_successors( columns ].drop_duplicates() # Find unique years where the above combinations of item-element-years of region and successors overlap. 
- overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) - overlapping_years = overlapping_years[overlapping_years.duplicated()] + if historical_region_years.empty and historical_successors_years.empty: + overlapping_years = pd.DataFrame() + else: + overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) + overlapping_years = overlapping_years[overlapping_years.duplicated()] if not overlapping_years.empty: log.warning( f"Removing rows where historical region {historical_region} overlaps with its successors " @@ -1298,7 +1301,7 @@ def convert_variables_given_per_capita_to_total_value(tb: Table, elements_metada # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: tb = tb.copy() @@ -1349,7 +1352,7 @@ def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=tb_with_pc_variables.shape) diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py index f2b706763db..632347f3ea9 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py @@ -182,6 +182,9 @@ def regional_aggregations(tb: Table, tb_pop: Table) -> Table: # Define non-colonies identifiers for `colonizer` non_colonies = ["zz. Colonizer", "zzz. Not colonized", "zzzz. 
No longer colonized"] + # Backwards compatibility + tb_regions["colonizer"] = tb_regions["colonizer"].astype(object).fillna(np.nan) + # Define colony_number, which is 1 if countries are not in non_colonies and colony_pop, which is the product of colony and population tb_regions["colony_number"] = tb_regions["colonizer"].apply(lambda x: 0 if x in non_colonies else 1) tb_regions["colony_pop"] = tb_regions["population"] * tb_regions["colony_number"] diff --git a/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py b/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py index 5131c53c201..0e5ed5f2577 100644 --- a/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py +++ b/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py @@ -48,7 +48,7 @@ def run(dest_dir: str) -> None: ds_meadow = Dataset(DATA_DIR / "meadow/homicide/2023-01-03/who_mort_db") tb_meadow = ds_meadow["who_mort_db"] - df = pd.DataFrame(tb_meadow) + df = pd.DataFrame(tb_meadow).astype({"number_of_deaths": float}) log.info("who_mort_db.exclude_countries") df = exclude_countries(df) @@ -92,7 +92,7 @@ def run(dest_dir: str) -> None: def clean_up_dimensions(df: pd.DataFrame) -> pd.DataFrame: sex_dict = {"All": "Both Sexes", "Male": "Males", "Female": "Females", "Unknown": "Unknown sex"} age_dict = {"Age_all": "All ages", "Age_unknown": "Unknown age"} - df = df.replace({"sex": sex_dict, "age_group_code": age_dict}) + df = df.astype({"sex": str, "age_group_code": str}).replace({"sex": sex_dict, "age_group_code": age_dict}) return df diff --git a/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py b/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py index 38473bb5aba..5c3ec2b8b5d 100644 --- a/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py +++ b/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py @@ -102,7 +102,7 @@ def make_table_prevalence(ds: Dataset) -> Table: "share_eating_disorders": "Eating disorders", "share_schizophrenia_disorders": "Schizophrenia", } - tb = tb.rename(columns=column_rename)[set(column_rename.values()) | {"year"}] + tb = tb.rename(columns=column_rename)[list(set(column_rename.values()) | {"year"})] # Unpivot tb = tb.melt(id_vars=["year"], var_name="cause", value_name="share_rate") diff --git a/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py b/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py index 6ef195fa15b..edd8db37ee3 100644 --- a/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py +++ b/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" import owid.catalog.processing as pr +import pandas as pd from etl.helpers import PathFinder, create_dataset @@ -27,8 +28,8 @@ def run(dest_dir: str) -> None: for year in range(2016, 2019): # 2019 is the stop value and is not included last_value = tb.loc[tb.index[-1], "plastic_production"] # Getting the last value in the 'Value' column new_value = last_value * (1 + growth_rate) # Calculating the value for the new year - new_row = {"country": "World", "year": year, "plastic_production": new_value} # Creating a new row - tb = tb.append(new_row, ignore_index=True) # Adding the new row to the DataFrame + new_row = pd.Series({"country": "World", "year": year, "plastic_production": new_value}) # Creating a new row + tb.loc[len(tb)] = new_row tb["plastic_production"] = tb["plastic_production"] * 1e6 # Convert to millions # Add data from OECD for 2019 diff --git 
a/etl/steps/data/garden/tourism/2023-05-05/unwto.py b/etl/steps/data/garden/tourism/2023-05-05/unwto.py index 54c1a2b5dbf..36208493c09 100644 --- a/etl/steps/data/garden/tourism/2023-05-05/unwto.py +++ b/etl/steps/data/garden/tourism/2023-05-05/unwto.py @@ -63,14 +63,14 @@ def run(dest_dir: str) -> None: merged_df_drop_ = merged_df.loc[~merged_df.country.isin(["Saba", "Sint Eustatius", "Bonaire"])] # Concatenate 'merged_df_drop_' and 'sum_bon_sint_saba' into a single DataFrame 'merged_df_concat'. # The rows of 'sum_bon_sint_saba' will be appended to 'merged_df_drop_'. - merged_df_concat = merged_df_drop_.append(sum_bon_sint_saba, ignore_index=True) + merged_df_concat = pd.concat([merged_df_drop_, sum_bon_sint_saba], ignore_index=True) # Set index, check that it's unique and reset index - assert not merged_df_concat[["country", "year"]].duplicated().any(), "Index is not well constructed" + assert not merged_df_concat[["country", "year"]].duplicated().any(), "Index is not well constructed" # type: ignore # Aggregate data by region (decided not to do for now) # Africa, Oceania, and income level categories - # regions_ = ["North America", + ## regions_ = ["North America", # "South America", # "Europe", # "Africa", diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py index 9a2ea51f26d..ae7a4784f82 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py @@ -76,7 +76,7 @@ def add_meaning_to_codes(tb: Table) -> Table: "min_tra_collab", ] - tb[cols_0_1_3] = tb[cols_0_1_3].astype("category").replace({0: "No", 1: "Yes", 3: "Don't know"}) + tb[cols_0_1_3] = tb[cols_0_1_3].astype(object).replace({0: "No", 1: "Yes", 3: "Don't know"}).astype("category") tb[cols_other] = ( tb[cols_other] .astype("object") diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py index 104a3d08220..66c12efbfd0 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py @@ -35,7 +35,9 @@ def run(dest_dir: str) -> None: # Add a 'year' column filled with 2018 df_counts["year"] = 2018 - df_counts["countries"] = df_counts["countries"].apply(lambda x: f"{x:,} inhabitants" if isinstance(x, int) else x) + df_counts["countries"] = ( + df_counts["countries"].astype(object).apply(lambda x: f"{x:,} inhabitants" if isinstance(x, int) else x) + ) # Replace '' values in the 'countries' column with 'No minimum population threshold' df_counts["countries"] = df_counts["countries"].astype(str).replace("", "No minimum population threshold") diff --git a/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py b/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py index 7f7c870a0aa..c5c662fd39c 100644 --- a/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py +++ b/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py @@ -29,7 +29,7 @@ def run(dest_dir: str) -> None: tb = tb.rename(columns=COLUMNS, errors="raise") # Looking at the original dashboards, it seems that missing values are shown as zeros. - tb = tb.fillna(0) + tb["number_of_warheads"] = tb["number_of_warheads"].fillna(0) # Harmonize country names. 
tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py b/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py index 671d20ecf9d..a5a5bcd91d0 100644 --- a/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py +++ b/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py @@ -38,7 +38,8 @@ def run(dest_dir: str) -> None: # Looking at the original dashboard, it seems that missing values are shown as zeros. # https://public.tableau.com/app/profile/kate.kohn/viz/EstimatedGlobalNuclearWarheadInventories2021/Dashboard1 - tb = tb.fillna(0) + cols = [c for c in tb.columns if c not in ["country", "year"]] + tb[cols] = tb[cols].fillna(0) # Harmonize country names. tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py b/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py index 8941f0c9885..554eafe4d68 100644 --- a/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py +++ b/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py @@ -35,7 +35,7 @@ def run(dest_dir: str) -> None: # Process data. # # By looking at the original table, it seems clear that empty cells mean zero. - tb = tb.fillna(0) + tb = tb.astype(object).fillna(0) # Temporarily convert all columns to string (to avoid issues with categorical variables). tb = tb.astype(str) diff --git a/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb b/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb index 2c7a7bc580d..69a65f20899 100644 --- a/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb +++ b/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb @@ -86,10 +86,20 @@ "id": "e001fe46", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/plain": [
-       "PosixPath('/Users/mojmir/projects/etl/data/meadow/wb/2021-07-01/wb_income')"
+       "\u001b[1;35mPosixPath\u001b[0m\u001b[1m(\u001b[0m\u001b[32m'/Users/mojmir/projects/etl2/data/meadow/wb/2021-07-01/wb_income'\u001b[0m\u001b[1m)\u001b[0m"
       ]
      },
      "execution_count": 4,
@@ -103,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 5,
    "id": "134ea32a-77b4-4e4c-af5c-400f6edd5866",
    "metadata": {},
    "outputs": [],
@@ -114,17 +124,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 6,
    "id": "24c738cd",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/plain": [
-       "['wb_income_group']"
+       "\u001b[1m[\u001b[0m\u001b[32m'wb_income_group'\u001b[0m\u001b[1m]\u001b[0m"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -135,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 7,
    "id": "5553eb58-fd10-4a93-9356-859121b7bed0",
    "metadata": {
     "tags": []
@@ -148,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 8,
    "id": "e9a67fe4-ca1e-4e73-b667-6cef8cc573b2",
    "metadata": {},
    "outputs": [
@@ -162,7 +182,20 @@
     {
      "data": {
       "text/html": [
-       "
\n", + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "             

wb_income_group

\n", + "

table

\n", + "
\n", "