From c51cdd748149e8038468c2827377b5607bdc6ea1 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 9 Apr 2024 17:00:43 +0200 Subject: [PATCH 01/61] Let ETL write execution times to hidden file, and print informative messages (#2503) * Let ETL write execution times to hidden file, and print informative messages * Show estimated time also when running etl in dry run mode --- .gitignore | 2 + etl/command.py | 118 +++++++++++++++++++++++++++++++++++++++++-------- etl/paths.py | 3 ++ 3 files changed, 105 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 0e29c71e468..0b877817eda 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ site/ .wizardcfg/* .streamlit/* .ipynb_lock +.execution_time.json + diff --git a/etl/command.py b/etl/command.py index 62a8f005b17..efc234a57cd 100644 --- a/etl/command.py +++ b/etl/command.py @@ -5,13 +5,17 @@ import difflib import itertools +import json import re import resource import sys import time +from collections.abc import MutableMapping from concurrent.futures import FIRST_COMPLETED, Future, ProcessPoolExecutor, ThreadPoolExecutor, wait from contextlib import contextmanager +from functools import partial from graphlib import TopologicalSorter +from multiprocessing import Manager from os import environ from pathlib import Path from typing import Any, Callable, Dict, Iterator, List, Optional, Set @@ -343,11 +347,18 @@ def run_dag( print("--- All datasets up to date!") return + # Calculate total expected time for all steps (if run sequentially) + total_expected_time_seconds = sum(_get_execution_time(str(step)) or 0 for step in steps) + if dry_run: - print(f"--- Running {len(steps)} steps:") + print( + f"--- Would run {len(steps)} steps{_create_expected_time_message(total_expected_time_seconds, prepend_message=' (at least ')}:" + ) return enumerate_steps(steps) elif workers == 1: - print(f"--- Running {len(steps)} steps:") + print( + f"--- Running {len(steps)} steps{_create_expected_time_message(total_expected_time_seconds, prepend_message=' (at least ')}:" + ) return exec_steps(steps, strict=strict) else: print(f"--- Running {len(steps)} steps with {workers} processes:") @@ -355,14 +366,24 @@ def run_dag( def exec_steps(steps: List[Step], strict: Optional[bool] = None) -> None: + execution_times = {} for i, step in enumerate(steps, 1): - print(f"--- {i}. {step}...") + print(f"--- {i}. {step}{_create_expected_time_message(_get_execution_time(step_name=str(step)))}") + + # Determine strictness level for the current step strict = _detect_strictness_level(step, strict) + with strictness_level(strict): + # Execute the step and measure the time taken time_taken = timed_run(lambda: step.run()) - click.echo(f"{click.style('OK', fg='blue')} ({time_taken:.1f}s)") + execution_times[str(step)] = time_taken + + click.echo(f"{click.style('OK', fg='blue')}{_create_expected_time_message(time_taken)}") print() + # Write the recorded execution times to the file after all steps have been executed + _write_execution_times(execution_times) + def _steps_sort_key(step: Step) -> int: """Sort steps by channel, so that grapher steps are executed first, then garden, then meadow, then snapshots.""" @@ -384,16 +405,27 @@ def exec_steps_parallel(steps: List[Step], workers: int, dag: DAG, strict: Optio # the load on MySQL steps = sorted(steps, key=_steps_sort_key) - # create execution graph from steps - exec_graph = {} - steps_str = {str(step) for step in steps} - for step in steps: - # only add dependencies that are in the list of steps (i.e. 
are dirty) - # NOTE: we have to compare their string versions, the actual objects might have - # different attributes - exec_graph[str(step)] = {str(dep) for dep in step.dependencies if str(dep) in steps_str} + # Use a Manager dict to collect execution times in parallel execution + with Manager() as manager: + execution_times = manager.dict() + + # Create execution graph from steps + exec_graph = {} + steps_str = {str(step) for step in steps} + for step in steps: + # only add dependencies that are in the list of steps (i.e. are dirty) + # NOTE: we have to compare their string versions, the actual objects might have + # different attributes + exec_graph[str(step)] = {str(dep) for dep in step.dependencies if str(dep) in steps_str} + + # Prepare a function for execution that includes the necessary arguments + exec_func = partial(_exec_step_job, execution_times=execution_times, dag=dag, strict=strict) + + # Execute the graph of tasks in parallel + exec_graph_parallel(exec_graph, exec_func, workers) - exec_graph_parallel(exec_graph, _exec_step_job, workers, dag=dag, strict=strict) + # After all tasks have completed, write the execution times to the file + _write_execution_times(dict(execution_times)) def exec_graph_parallel( @@ -433,7 +465,24 @@ def exec_graph_parallel( topological_sorter.done(task) -def _exec_step_job(step_name: str, dag: Optional[DAG] = None, strict: Optional[bool] = None) -> None: +def _create_expected_time_message( + expected_time: Optional[float], prepend_message: str = " (", append_message: str = ")" +) -> str: + minutes, seconds = divmod(expected_time or 0, 60) + if minutes < 1: + partial_message = f"{seconds:.1f}s" + else: + partial_message = f"{int(minutes)}m{seconds: .1f}s" + + if (expected_time is None) or (expected_time == 0): + return "" + else: + return prepend_message + partial_message + append_message + + +def _exec_step_job( + step_name: str, execution_times: MutableMapping, dag: Optional[DAG] = None, strict: Optional[bool] = None +) -> None: """ Executes a step. @@ -441,19 +490,52 @@ def _exec_step_job(step_name: str, dag: Optional[DAG] = None, strict: Optional[b :param dag: The original DAG used to create Step object. This must be the same DAG as given to ETL. :param strict: The strictness level for the step execution. 
""" - print(f"--- Starting {step_name}", flush=True) + print(f"--- Starting {step_name}{_create_expected_time_message(_get_execution_time(step_name))}") assert dag step = parse_step(step_name, dag) strict = _detect_strictness_level(step, strict) with strictness_level(strict): - time_taken = timed_run(lambda: step.run()) + execution_times[step_name] = timed_run(lambda: step.run()) + print(f"--- Finished {step_name} ({execution_times[step_name]:.1f}s)") + + +def _write_execution_times(execution_times: Dict) -> None: + # Write the recorded execution times to a hidden json file that contains the time it took to execute each step + execution_time_file = paths.EXECUTION_TIME_FILE + if execution_time_file.exists(): + with open(execution_time_file, "r") as file: + stored_times = json.load(file) + else: + stored_times = {} - print(f"--- Finished {step_name} ({time_taken:.0f}s)", flush=True) + stored_times.update(execution_times) + with open(execution_time_file, "w") as file: + json.dump(stored_times, file, indent=4, sort_keys=True) + + +def _get_step_identifier(step_name: str) -> str: + return step_name.replace(step_name.split("/")[-2] + "/", "") + + +def _get_execution_time(step_name: str) -> Optional[float]: + # Read execution time of a given step from the hidden json file + # If it doesn't exist, try to read another version of the same step, and if no other version exists, return None + if not paths.EXECUTION_TIME_FILE.exists(): + return None + else: + with open(paths.EXECUTION_TIME_FILE, "r") as file: + execution_times = json.load(file) + execution_time = execution_times.get(step_name) + if not execution_time: + # If the step has not been timed yet, try to find a previous version + step_identifiers = {_get_step_identifier(step): value for step, value in execution_times.items()} + execution_time = step_identifiers.get(_get_step_identifier(step_name)) + return execution_time def enumerate_steps(steps: List[Step]) -> None: for i, step in enumerate(steps, 1): - print(f"{i}. {step}") + print(f"{i}. {step}{_create_expected_time_message(_get_execution_time(str(step)))}") def _detect_strictness_level(step: Step, strict: Optional[bool] = None) -> bool: diff --git a/etl/paths.py b/etl/paths.py index 30e465b6feb..a3fa8889535 100644 --- a/etl/paths.py +++ b/etl/paths.py @@ -62,3 +62,6 @@ # Use paths.DAG_ARCHIVE_FILE to load the complete dag, with active and archive steps. # Otherwise use paths.DAG_FILE to load only active steps, ignoring archive ones. DEFAULT_DAG_FILE = DAG_FILE + +# Hidden ETL file that will keep the time it took to execute each step. 
+EXECUTION_TIME_FILE = BASE_DIR / ".execution_time.json" From 4e612809e48f5f88aeb9d663771000cabad98c11 Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 10 Apr 2024 04:03:34 +0000 Subject: [PATCH 02/61] :robot: automatic wildfires update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 11 +++++------ snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 2f35edebadf..45fb735c98e 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -5,17 +5,16 @@ meta: description: |- The dataset provides a weekly comprehensive overview of fire activity and its environmental impact, incorporating data from the Global Wildfire Information System (GWIS) and satellite imagery from MODIS and VIIRS. It includes metrics such as the area of land burnt, cumulative burnt areas, carbon dioxide emissions from fires, cumulative carbon emissions, the number of fires, and cumulative fire counts. title_snapshot: Seasonal wildfire trends (2024 and later) - description_snapshot: This dataset focuses specifically on older data. A separate snapshot will be created to add more - recent data. + description_snapshot: This dataset focuses specifically on older data. A separate snapshot will be created to add more recent data. citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-04-09 - date_published: 2024-04-09 + date_accessed: 2024-04-10 + date_published: 2024-04-10 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license outs: - - md5: 65a4703accc44038d0f82b83879b006f - size: 11611371 + - md5: 1bc963ac2662d95647d5d69942a1d416 + size: 11623135 path: weekly_wildfires.csv diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index aced7229bef..070ccb99ce4 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: 2024-03-18 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index b697b7fa6d3..8f957225ab9 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. 
url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 44b27194cad..238de18b5f5 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index 72e71085f99..9d1cd01c148 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-09 + date_accessed: 2024-04-10 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From c2a05f92259fd058eb4f1f5ba1a9bf440e20b72f Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 10 Apr 2024 04:05:46 +0000 Subject: [PATCH 03/61] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 4dad17eec08..d4df3f712c6 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 7ea6a347dd2cfff19b73c86c94685cec - size: 150179792 + - md5: c871c20f9342720af8d2634b4641d004 + size: 150197770 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 164d270369f..c5c0f09bca9 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. 
wdir: ../../../data/snapshots/who/latest outs: - - md5: b4ddf1f92ee41abb6c060264b93bc487 - size: 25727681 + - md5: 61a80a627866aec81a5fd99e8f169041 + size: 25729116 path: flunet.csv From ae097eebdfe8c044ff53cc535cf12398e0059597 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 10 Apr 2024 12:28:12 +0200 Subject: [PATCH 04/61] =?UTF-8?q?=F0=9F=93=8A=20Update=20dataset=20on=20na?= =?UTF-8?q?tional=20contributions=20to=20climate=20change=20reference=20br?= =?UTF-8?q?anch=20(#2501)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Duplicate code from previous version of national contributions to global warming dataset * Update dataset on national contributions to climate change (#2496) * Adapt snapshots, meadow, garden and grapher steps * Fix spurious negative numbers * Improve format * Various small improvements, following Pablo A suggestions --- dag/emissions.yml | 42 +- .../national_contributions.countries.json | 227 ++++++++++ ...onal_contributions.excluded_countries.json | 9 + .../national_contributions.meta.yml | 428 ++++++++++++++++++ .../2024-04-08/national_contributions.py | 354 +++++++++++++++ .../2024-04-08/national_contributions.py | 22 + .../2024-04-08/national_contributions.py | 50 ++ .../2024-04-08/national_contributions.py | 108 +++++ ...nal_contributions_annual_emissions.csv.dvc | 33 ++ ...contributions_cumulative_emissions.csv.dvc | 33 ++ ...contributions_temperature_response.csv.dvc | 33 ++ 11 files changed, 1325 insertions(+), 14 deletions(-) create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml create mode 100644 etl/steps/data/garden/emissions/2024-04-08/national_contributions.py create mode 100644 etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py create mode 100644 etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py create mode 100644 snapshots/emissions/2024-04-08/national_contributions.py create mode 100644 snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc create mode 100644 snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc create mode 100644 snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc diff --git a/dag/emissions.yml b/dag/emissions.yml index 623547c5067..5849d3f8642 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -76,20 +76,6 @@ steps: data://grapher/rff/2023-10-19/emissions_weighted_carbon_price: - data://garden/rff/2023-10-19/emissions_weighted_carbon_price # - # Jones et al. (2023) - National contributions to climate change. 
- # - data://meadow/emissions/2023-11-23/national_contributions: - - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv - data://garden/emissions/2023-11-23/national_contributions: - - data://meadow/emissions/2023-11-23/national_contributions - - data://garden/regions/2023-01-01/regions - - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - data://grapher/emissions/2023-11-23/national_contributions: - - data://garden/emissions/2023-11-23/national_contributions - # # IPCC - Emission Factor Database (2023-10-24). # data://meadow/emissions/2023-10-24/emission_factors: @@ -124,9 +110,37 @@ steps: data://garden/emissions/2024-02-26/gdp_and_co2_decoupling: - data://garden/gcp/2023-12-12/global_carbon_budget - data://garden/worldbank_wdi/2023-05-29/wdi + # + # Jones et al. - National contributions to climate change. + # + data://meadow/emissions/2024-04-08/national_contributions: + - snapshot://emissions/2024-04-08/national_contributions_temperature_response.csv + - snapshot://emissions/2024-04-08/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2024-04-08/national_contributions_annual_emissions.csv + data://garden/emissions/2024-04-08/national_contributions: + - data://meadow/emissions/2024-04-08/national_contributions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2024-03-11/income_groups + - data://garden/regions/2023-01-01/regions + data://grapher/emissions/2024-04-08/national_contributions: + - data://garden/emissions/2024-04-08/national_contributions ###################################################################################################################### # Older versions that should be archived once they are not used by any other steps. + # + # Jones et al. (2023) - National contributions to climate change. 
+ # + data://meadow/emissions/2023-11-23/national_contributions: + - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv + data://garden/emissions/2023-11-23/national_contributions: + - data://meadow/emissions/2023-11-23/national_contributions + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2023-04-30/income_groups + data://grapher/emissions/2023-11-23/national_contributions: + - data://garden/emissions/2023-11-23/national_contributions ###################################################################################################################### diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json new file mode 100644 index 00000000000..5b3ccbfe1df --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json @@ -0,0 +1,227 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Christmas Island": "Christmas Island", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "EU27": "European Union (27)", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Faeroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Polynesia": "French Polynesia", + "GLOBAL": "World", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + 
"Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Occupied Palestinian Territory": "Palestine", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USA": "United States", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + 
"Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires", + "Leeward Islands": "Leeward Islands", + "Panama Canal Zone": "Panama Canal Zone", + "Ryukyu Islands": "Ryukyu Islands", + "St. Kitts-Nevis-Anguilla": "St. Kitts-Nevis-Anguilla", + "LDC": "Least developed countries (Jones et al.)", + "OECD": "OECD (Jones et al.)" +} diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json new file mode 100644 index 00000000000..f4e1bbdf837 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json @@ -0,0 +1,9 @@ +[ + "ANNEXI", + "ANNEXII", + "BASIC", + "EIT", + "LMDC", + "NONANNEX", + "Pacific Islands (Palau)" +] diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml new file mode 100644 index 00000000000..8d6fd94bf5e --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml @@ -0,0 +1,428 @@ +definitions: + measured_in_celsius: &measured-in-celsius |- + Measured in °C. + measured_in_tonnes: &measured-in-tonnes |- + Measured in tonnes. + measured_in_tonnes_per_person: &measured-in-tonnes-per-person |- + Measured in tonnes per person. + measured_in_co2_eq: &measured-in-co2-eq |- + Measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + measured_in_co2_eq_per_person: &measured-in-co2-eq-per-person |- + Measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions: &ghg-emissions |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions_per_person: &ghg-emissions-per-person |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + processing_methane: &processing-methane |- + Methane emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources and 27.2 for agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_nitrous_oxide: &processing-nitrous-oxide |- + Nitrous oxide emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273. This factor is taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_greenhouse_gases: &processing-greenhouse-gases |- + Emissions given in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). 
+ common: + processing_level: major + presentation: + topic_tags: + - CO2 & Greenhouse Gas Emissions + +dataset: + update_period_days: 365 + description: |- + Jones et al. quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. + +tables: + national_contributions: + variables: + # Emissions of CH4, CO2, N2O in tonnes (as originally given in the data). + annual_emissions_ch4_fossil: + title: Annual methane emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land: + title: Annual methane emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total: + title: Annual methane emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions + annual_emissions_co2_fossil: + title: Annual CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from fossil fuels and industry + annual_emissions_co2_land: + title: Annual CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from agriculture and land use + annual_emissions_co2_total: + title: Annual CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions + annual_emissions_n2o_fossil: + title: Annual nitrous oxide emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land: + title: Annual nitrous oxide emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total: + title: Annual nitrous oxide emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions + # Emissions (calculated by OWID) of CH4, CO2, N2O in tonnes of CO2eq, as well as combined GHG emissions in CO2eq. 
+ annual_emissions_ghg_fossil_co2eq: + title: Annual greenhouse gas emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from fossil fuels and industry + annual_emissions_ghg_land_co2eq: + title: Annual greenhouse gas emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from agriculture and land use + annual_emissions_ghg_total_co2eq: + title: Annual greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions + annual_emissions_ch4_fossil_co2eq: + title: Annual methane emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land_co2eq: + title: Annual methane emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total_co2eq: + title: Annual methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions + annual_emissions_n2o_fossil_co2eq: + title: Annual nitrous oxide emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land_co2eq: + title: Annual nitrous oxide emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total_co2eq: + title: Annual nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions + # Cumulative emissions of CH4, CO2, N2O and GHG, in tonnes of CO2eq (as originally given in the data). 
+ cumulative_emissions_ghg_fossil: + title: Cumulative greenhouse gas emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from fossil fuels and industry + cumulative_emissions_ghg_land: + title: Cumulative greenhouse gas emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from agriculture and land use + cumulative_emissions_ghg_total: + title: Cumulative greenhouse gas emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions + cumulative_emissions_ch4_fossil: + title: Cumulative methane emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from fossil fuels and industry + cumulative_emissions_ch4_land: + title: Cumulative methane emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from agriculture and land use + cumulative_emissions_ch4_total: + title: Cumulative methane emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions + cumulative_emissions_co2_fossil: + title: Cumulative CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from fossil fuels and industry + cumulative_emissions_co2_land: + title: Cumulative CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from agriculture and land use + cumulative_emissions_co2_total: + title: Cumulative CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions + cumulative_emissions_n2o_fossil: + title: Cumulative nitrous oxide emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from fossil fuels and industry + cumulative_emissions_n2o_land: + title: Cumulative nitrous oxide emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from agriculture and land use + cumulative_emissions_n2o_total: + title: Cumulative nitrous oxide emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions + # Temperature response to emissions of CH4, CO2, N2O and GHG, in °C (as originally given in the data). 
+ temperature_response_ghg_fossil: + title: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + temperature_response_ghg_land: + title: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + temperature_response_ghg_total: + title: Change in global mean surface temperature caused by greenhouse gas emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions + temperature_response_ch4_fossil: + title: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + temperature_response_ch4_land: + title: Change in global mean surface temperature caused by methane emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from agriculture and land use + temperature_response_ch4_total: + title: Change in global mean surface temperature caused by methane emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of methane. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. 
+ presentation: + title_public: Change in global mean surface temperature caused by methane emissions + temperature_response_co2_fossil: + title: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + temperature_response_co2_land: + title: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + temperature_response_co2_total: + title: Change in global mean surface temperature caused by CO₂ emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions + temperature_response_n2o_fossil: + title: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + temperature_response_n2o_land: + title: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + temperature_response_n2o_total: + title: Change in global mean surface temperature caused by nitrous oxide emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions + # Share of emissions (calculated by OWID), e.g. methane emissions as a percentage of global methane emissions. + # NOTE: Using CO2eq or tonnes of the original gas is irrelevant when calculated as a share of global emissions. + share_of_annual_emissions_ghg_total: + title: Share of global greenhouse gas emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's greenhouse gas emissions." + description_processing: *processing-greenhouse-gases + presentation: + title_public: Share of global greenhouse gas emissions + share_of_annual_emissions_ch4_total: + title: Share of global methane emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's methane emissions." 
+ presentation: + title_public: Share of global methane emissions + share_of_annual_emissions_co2_total: + title: Share of global CO₂ emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's carbon dioxide emissions." + presentation: + title_public: Share of global CO₂ emissions + share_of_annual_emissions_n2o_total: + title: Share of global nitrous oxide emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's nitrous oxide emissions." + presentation: + title_public: Share of global nitrous oxide emissions + # Share of global temperature change caused by greenhouse gas emissions from each country (calculated by OWID). + share_of_temperature_response_ghg_total: + title: Share of contribution to global warming + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's temperature change." + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Share of contribution to global warming + # Per capita emissions (calculated by OWID). + annual_emissions_co2_total_per_capita: + title: Per-capita CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes-per-person + presentation: + title_public: Per-capita CO₂ emissions + annual_emissions_ch4_total_co2eq_per_capita: + title: Per-capita methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-methane + presentation: + title_public: Per-capita methane emissions + annual_emissions_n2o_total_co2eq_per_capita: + title: Per-capita nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-nitrous-oxide + presentation: + title_public: Per-capita nitrous oxide emissions + annual_emissions_ghg_total_co2eq_per_capita: + title: Per-capita greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions-per-person + description_processing: *processing-greenhouse-gases + presentation: + title_public: Per-capita greenhouse gas emissions diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..6ac00bafe70 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,354 @@ +"""Load a meadow dataset and create a garden dataset.""" + + +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table, Variable +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor to change from teragrams to tonnes. +TERAGRAMS_TO_TONNES = 1e6 +# Conversion factor to change from petagrams to tonnes. 
+PETAGRAMS_TO_TONNES = 1e9 + +# Conversion factors to change from tonnes of gases emitted to tonnes of CO2 equivalents (taken from IPCC AR6). +CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS = 29.8 +CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS = 27.2 +N2O_EMISSIONS_TO_CO2_EQUIVALENTS = 273 + +# Gases and components expected to be in the data, and how to rename them. +GASES_RENAMING = { + "3-GHG": "ghg", + "CH[4]": "ch4", + "CO[2]": "co2", + "N[2]*O": "n2o", +} +COMPONENTS_RENAMING = { + "Fossil": "fossil", + "LULUCF": "land", + "Total": "total", +} + +# Columns for which we will create "share" variables, e.g. the percentage of methane emissions that a country produces +# in a year with respect to the world's methane emissions on the same year. +# NOTE: For this calculation, it doesn't matter if we use the total or the CO2-equivalent emissions. +SHARE_VARIABLES = [ + "annual_emissions_ch4_total", + "annual_emissions_co2_total", + "annual_emissions_n2o_total", + "annual_emissions_ghg_total_co2eq", + "temperature_response_ghg_total", +] + +# Columns for which a per-capita variable will be created. +PER_CAPITA_VARIABLES = [ + "annual_emissions_ch4_total_co2eq", + "annual_emissions_co2_total", + "annual_emissions_n2o_total_co2eq", + "annual_emissions_ghg_total_co2eq", +] + +# Regions to be added by aggregating data from their member countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, + # EU27 is already included in the original data. + # "European Union (27)": {}, +} + + +def run_sanity_checks_on_inputs(tb): + # Sanity checks. + error = "Names of gases have changed." + assert set(tb["gas"]) == set(GASES_RENAMING), error + error = "Names of components have changed." + assert set(tb["component"]) == set(COMPONENTS_RENAMING), error + error = "Units have changed." + assert set(tb["unit"]) == set( + ["Tg~CH[4]~year^-1", "Pg~CO[2]~year^-1", "Tg~N[2]*O~year^-1", "Pg~CO[2]*-e[100]", "°C"] + ), error + + +def add_kuwaiti_oil_fires_to_kuwait(tb: Table) -> Table: + tb = tb.copy() + + # NOTE: Use this function before harmonizing country names. Otherwise adapt the following definitions. + kuwait = "Kuwait" + oil_fires = "Kuwaiti Oil Fires" + + # Sanity check. + error = f"'{kuwait}' or '{oil_fires}' not found in the data." + assert kuwait in set(tb["country"]), error + assert oil_fires in set(tb["country"]), error + + # Add the emissions from the Kuwaiti oil fires (in 1991) to Kuwait. 
+ tb_kuwait = tb[tb["country"] == kuwait].drop(columns="country").set_index("year") + tb_oil_fires = tb[tb["country"] == oil_fires].drop(columns="country").fillna(0).set_index(["year"]) + tb_combined = (tb_kuwait + tb_oil_fires).reset_index().assign(**{"country": kuwait}) + + # Replace the original data for Kuwait by the combined data. + tb_updated = pr.concat([tb[tb["country"] != kuwait].reset_index(drop=True), tb_combined], ignore_index=True) + + # Sort conveniently. + tb_updated = tb_updated.sort_values(["country", "year"]).reset_index(drop=True) + + return tb_updated + + +def add_emissions_in_co2_equivalents(tb: Table) -> Table: + # Add columns for fossil/land/total emissions of CH4 in terms of CO2 equivalents. + # NOTE: For methane, we apply different conversion factors for fossil and land-use emissions. + tb["annual_emissions_ch4_fossil_co2eq"] = ( + tb["annual_emissions_ch4_fossil"] * CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS + ) + tb["annual_emissions_ch4_land_co2eq"] = tb["annual_emissions_ch4_land"] * CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS + tb["annual_emissions_ch4_total_co2eq"] = ( + tb["annual_emissions_ch4_fossil_co2eq"] + tb["annual_emissions_ch4_land_co2eq"] + ) + + # Add columns for fossil/land/total emissions of N2O in terms of CO2 equivalents. + # NOTE: For nitrous oxide, we apply the same conversion factors for fossil and land-use emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_n2o_{component}_co2eq"] = ( + tb[f"annual_emissions_n2o_{component}"] * N2O_EMISSIONS_TO_CO2_EQUIVALENTS + ) + + # Add columns for fossil/land/total emissions of all GHG in terms of CO2 equivalents. + # NOTE: The file of annual emissions does not include GHG emissions, which is why we need to add them now. + # However, the files of temperature response and cumulative emissions do include GHG emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_ghg_{component}_co2eq"] = ( + tb[f"annual_emissions_co2_{component}"] + + tb[f"annual_emissions_ch4_{component}_co2eq"] + + tb[f"annual_emissions_n2o_{component}_co2eq"] + ) + + return tb + + +def add_share_variables(tb: Table) -> Table: + tb = tb.copy() + + # Create "share" variables (percentages with respect to global). + # To do that, first create a separate table for global data, and add it to the main table. + tb_global = tb[tb["country"] == "World"][["year"] + SHARE_VARIABLES].reset_index(drop=True) + + tb = tb.merge(tb_global, on=["year"], how="left", suffixes=("", "_global")) + # For a list of variables, add the percentage with respect to global. + for variable in SHARE_VARIABLES: + new_variable = f"share_of_{variable.replace('_co2eq', '')}" + tb[new_variable] = 100 * tb[variable] / tb[f"{variable}_global"] + + # Drop unnecessary columns for global data. + tb = tb.drop(columns=[column for column in tb.columns if column.endswith("_global")], errors="raise") + + return tb + + +def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table: + tb = tb.copy() + + # Add population to data. + tb = geo.add_population_to_table( + tb=tb, + ds_population=ds_population, + warn_on_missing_countries=False, + ) + + # Add per-capita variables. + for variable in PER_CAPITA_VARIABLES: + tb[f"{variable}_per_capita"] = tb[variable] / tb["population"] + + # Drop population column. + tb = tb.drop(columns="population", errors="raise") + + return tb + + +def fix_emissions_jump_in_1850(tb: Table) -> Table: + # There is data from 1830 for some variables and from 1850 for others. 
+ # However, when inspecting data between 1830 and 1850 (e.g. annual_emissions_co2_total) there is an abrupt jump + # between 1849 and 1850, which happens for many countries (e.g. Spain, or World). + # This jump seems to be spurious, and therefore we start all time series from 1850. + + # First check that the jump is still in the data. + emissions_before_jump = tb[(tb["country"] == "World") & (tb["year"] == 1849)]["annual_emissions_co2_total"].item() + emissions_after_jump = tb[(tb["country"] == "World") & (tb["year"] == 1850)]["annual_emissions_co2_total"].item() + error = "Spurious jump between 1849 and 1850 is not in the data anymore. Remove this part of the code." + assert emissions_after_jump / emissions_before_jump > 10, error + + # Visually inspect the jump. + # import plotly.express as px + # px.line(tb[tb["country"]=="World"], x="year", y="annual_emissions_co2_total", markers=True) + + # Start all data after the jump. + tb = tb[tb["year"] >= 1850].reset_index(drop=True) + + return tb + + +def run_sanity_checks_on_outputs(tb: Table) -> None: + error = "Share of global emissions cannot be larger than 101%" + assert (tb[[column for column in tb.columns if "share" in column]].max() < 101).all(), error + error = "Share of global emissions was not expected to be smaller than -1%" + # Some countries did contribute negatively to CO2 emissions, however overall the negative contribution is always + # smaller than 1% in absolute value. + assert (tb[[column for column in tb.columns if "share" in column]].min() > -1).all(), error + + # Ensure that no country contributes to emissions more than the entire world. + columns_that_should_be_smaller_than_global = [ + column for column in tb.drop(columns=["country", "year"]).columns if "capita" not in column + ] + tb_global = tb[tb["country"] == "World"].drop(columns="country") + check = pr.merge( + tb[tb["country"] != "World"].reset_index(drop=True), tb_global, on="year", how="left", suffixes=("", "_global") + ) + for column in columns_that_should_be_smaller_than_global: + # It is in principle possible that some region would emit more than the world, if the rest of regions + # were contributing with negative CO2 emissions (e.g. High-income countries in 1854). + # However, the difference should be very small. + error = f"Region contributed to {column} more than the entire world." + assert check[(check[column] - check[f"{column}_global"]) / check[f"{column}_global"] > 0.00001].empty, error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("national_contributions") + tb = ds_meadow["national_contributions"].reset_index() + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # + # Process data. + # + # Sanity checks. + run_sanity_checks_on_inputs(tb=tb) + + # Rename gases and components. + tb["gas"] = Variable( + map_series( + series=tb["gas"], mapping=GASES_RENAMING, warn_on_missing_mappings=True, warn_on_unused_mappings=True + ) + ).copy_metadata(tb["gas"]) + tb["component"] = Variable( + map_series( + series=tb["component"], + mapping=COMPONENTS_RENAMING, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + ) + ).copy_metadata(tb["component"]) + + # Convert units from teragrams and petagrams to tonnes. 
+ tb.loc[tb["unit"].str.startswith("Tg"), "data"] *= TERAGRAMS_TO_TONNES + tb.loc[tb["unit"].str.startswith("Pg"), "data"] *= PETAGRAMS_TO_TONNES + + # Transpose data. + tb = tb.pivot( + index=["country", "year"], columns=["file", "gas", "component"], values="data", join_column_levels_with="_" + ) + + # We add the emissions from the Kuwaiti oil fires in 1991 (which are also included as a separate country) as part + # of the emissions of Kuwait. + # This ensures that these emissions will be included in aggregates of regions that include Kuwait. + tb = add_kuwaiti_oil_fires_to_kuwait(tb=tb) + + # Harmonize country names. + tb = geo.harmonize_countries( + tb, + countries_file=paths.country_mapping_path, + excluded_countries_file=paths.excluded_countries_path, + ) + + # Replace spurious negative values with zeros (and ensure they are small numbers, within the uncertainty). + columns_that_cannot_be_negative = [column for column in tb.columns if "fossil" in column] + #################################################################################################################### + # TODO: For some reason, cumulative_emissions_ch4_fossil (and therefore cumulative_emissions_ghg_fossil) have + # big negative values. For example for Ireland's value in 2022 is of -2.93e+08! + # I will look into this, but, for now, I'll ignore those negative values (we are not using these indicators in + # any chart). + columns_that_cannot_be_negative = [ + column + for column in columns_that_cannot_be_negative + if column not in ["cumulative_emissions_ch4_fossil", "cumulative_emissions_ghg_fossil"] + ] + #################################################################################################################### + for column in columns_that_cannot_be_negative: + # Ensure all negative values are just numerical noise. + assert (tb[column].fillna(0) >= -2e-4).all() + # Replace those values by zero. + tb[column] = tb[column].clip(lower=0) + + # Add region aggregates. + tb = geo.add_regions_to_table( + tb=tb, ds_regions=ds_regions, ds_income_groups=ds_income_groups, regions=REGIONS, min_num_values_per_year=1 + ) + + # Add columns for emissions in terms of CO2 equivalents. + tb = add_emissions_in_co2_equivalents(tb=tb) + + # Add "share" variables (percentages with respect to global emissions). + tb = add_share_variables(tb=tb) + + # Add per-capita variables. + tb = add_per_capita_variables(tb=tb, ds_population=ds_population) + + # Fix spurious jump in the data in 1850. + tb = fix_emissions_jump_in_1850(tb=tb) + + # Sanity checks. + run_sanity_checks_on_outputs(tb=tb) + + # Set an appropriate index and sort conveniently. + tb = tb.format() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py b/etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..a8bf5f2bebf --- /dev/null +++ b/etl/steps/data/grapher/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,22 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. 
+ ds_garden = paths.load_dataset("national_contributions") + tb_garden = ds_garden["national_contributions"] + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb_garden], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py b/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..df58d26b5f6 --- /dev/null +++ b/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,50 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve all snapshots of the dataset. + snap_annual = paths.load_snapshot("national_contributions_annual_emissions.csv") + snap_cumulative = paths.load_snapshot("national_contributions_cumulative_emissions.csv") + snap_temperature = paths.load_snapshot("national_contributions_temperature_response.csv") + + # Load data from snapshots. + tb_annual = snap_annual.read(underscore=True) + tb_cumulative = snap_cumulative.read(underscore=True) + tb_temperature = snap_temperature.read(underscore=True) + + # + # Process data. + # + # Combine all data into one table. + tb = pr.concat( + [ + tb_annual.assign(**{"file": "annual_emissions"}), + tb_cumulative.assign(**{"file": "cumulative_emissions"}), + tb_temperature.assign(**{"file": "temperature_response"}), + ], + ignore_index=True, + short_name=paths.short_name, + ) + + # Rename columns conveniently. + tb = tb.rename(columns={"cntr_name": "country"}, errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.format(keys=["country", "year", "file", "gas", "component"]) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/emissions/2024-04-08/national_contributions.py b/snapshots/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..ed14f53b5ee --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,108 @@ +"""Script to create a snapshot of dataset National contributions to climate change (Jones et al.). + +NOTE: All metadata fields are automatically updated by this script. However, the dataset description may change a bit +(for example they may cite more recent papers). Visually inspect the dataset description and manually make small +modifications, if needed. + +""" + +from datetime import datetime +from pathlib import Path +from typing import Dict + +import click +import requests +from bs4 import BeautifulSoup + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of data files to snapshot. +DATA_FILES = [ + "annual_emissions.csv", + "cumulative_emissions.csv", + "temperature_response.csv", +] + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +def main(upload: bool) -> None: + for data_file in DATA_FILES: + # Create a new snapshot. + snap = Snapshot(f"emissions/{SNAPSHOT_VERSION}/national_contributions_{data_file}") + + # Get the publication date (it needs to be done only once). 
+ extracted_fields = extract_metadata_from_main_page(snap) + + for field in extracted_fields: + # Replace metadata fields with the new extracted fields. + setattr(snap.metadata.origin, field, extracted_fields[field]) + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +def extract_metadata_from_main_page(snap: Snapshot) -> Dict[str, str]: + """Extract the publication date.""" + # Get the full HTML content of the main page. + response = requests.get(snap.metadata.origin.url_main) # type: ignore + + # The "latest" url redirects to the new record (which we need to extract other fields). + response_final = response.url + + # Parse the HTML content of the main page. + soup = BeautifulSoup(response.content, "html.parser") + + # Extract the publication date, which is given in one of the first sentences as in, e.g. "Published March 19, 2024". + date_published_str = [line.split("Published")[1].strip() for line in soup.text.split("\n") if "Published" in line][ + 0 + ] + + # Convert to ISO format. + date_published = datetime.strptime(date_published_str, "%B %d, %Y").strftime("%Y-%m-%d") + + # Extract the version of the data producer. + version_producer = [line.split("| Version ")[1].strip() for line in soup.text.split("\n") if "| Version " in line][ + 0 + ] + + # The download links have the years hardcoded in the url, so we need to update them. + file_name = snap.metadata.origin.url_download.split("/")[-1] # type: ignore + # Assume that the latest informed year in the data is 2 years before the current version. + file_name_new = file_name.split("-")[0] + "-" + str(int(version_producer.split(".")[0]) - 2) + ".csv" + # Create the new download url (using the new token for the latest version, and the latest year in the file name). + url_download = response_final + "/files/" + file_name_new + + # The full citation is not included in the HTML and is fetched from an API. + response_citation = requests.get( + response_final.replace("records/", "api/records/") + "?style=chicago-fullnote-bibliography", + headers={"Accept": "text/x-bibliography"}, + ) + + # Extract the full citation. + citation_full = response_citation.text + + # Gather all extracted fields. + extracted_fields = { + "date_published": date_published, + "version_producer": version_producer, + "url_download": url_download, + "citation_full": citation_full, + } + + return extracted_fields + + +if __name__ == "__main__": + main() diff --git a/snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc b/snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc new file mode 100644 index 00000000000..5ee24c0880e --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions_annual_emissions.csv.dvc @@ -0,0 +1,33 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). 
+ + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O emissions are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Annual emissions + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, March 19, 2024. https://doi.org/10.5281/zenodo.10839859. + version_producer: '2024.1' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/10839859/files/EMISSIONS_ANNUAL_1830-2022.csv + date_accessed: '2024-04-08' + date_published: '2024-03-19' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: 9f931081993e0367f14aaeddb338cbcb + size: 26279535 + path: national_contributions_annual_emissions.csv diff --git a/snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc b/snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc new file mode 100644 index 00000000000..69c308eb405 --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions_cumulative_emissions.csv.dvc @@ -0,0 +1,33 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources during since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O emissions are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). 
+ + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Cumulative emissions + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, March 19, 2024. https://doi.org/10.5281/zenodo.10839859. + version_producer: '2024.1' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/10839859/files/EMISSIONS_CUMULATIVE_CO2e100_1851-2022.csv + date_accessed: '2024-04-08' + date_published: '2024-03-19' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: f4f7519994d16cee7a791cb9277c0793 + size: 33485575 + path: national_contributions_cumulative_emissions.csv diff --git a/snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc b/snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc new file mode 100644 index 00000000000..dab115b1493 --- /dev/null +++ b/snapshots/emissions/2024-04-08/national_contributions_temperature_response.csv.dvc @@ -0,0 +1,33 @@ +meta: + origin: + producer: Jones et al. + title: National contributions to climate change + description: |- + National contributions to climate change due to historical emissions of carbon dioxide, methane and nitrous oxide. + + This dataset describes the global warming response to national emissions CO₂, CH₄ and N₂O from fossil and land use sources since 1851. + + National CO₂ emissions data are collated from the Global Carbon Project (Andrew and Peters, 2023; Friedlingstein et al., 2023). + + National CH₄ and N₂O emissions data are collated from PRIMAP-hist (HISTTP) (Gütschow et al., 2023). + + A time series of cumulative CO₂-equivalent emissions is constructed for each country, gas, and emissions source (fossil or land use). Emissions of CH₄ and N₂O emissions are related to cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach, with best-estimates of the coefficients taken from the IPCC AR6 (Forster et al., 2021). + + Warming in response to cumulative CO₂-equivalent emissions is estimated using the transient climate response to cumulative carbon emissions (TCRE) approach, with best-estimate value of TCRE taken from the IPCC AR6 (Forster et al., 2021, Canadell et al., 2021). 'Warming' is specifically the change in global mean surface temperature (GMST). + + The data files provide emissions, cumulative emissions and the GMST response by country, gas (CO₂, CH₄, N₂O or 3-GHG total) and source (fossil emissions, land use emissions or the total). + title_snapshot: National contributions to climate change - Temperature response + citation_full: |- + Jones, Matthew W., Glen P. Peters, Thomas Gasser, Robbie M. Andrew, Clemens Schwingshackl, Johannes Gütschow, Richard A. Houghton, Pierre Friedlingstein, Julia Pongratz, and Corinne Le Quéré. “National Contributions to Climate Change Due to Historical Emissions of Carbon Dioxide, Methane and Nitrous Oxide”. Scientific Data. Zenodo, March 19, 2024. https://doi.org/10.5281/zenodo.10839859. 
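As a rough illustration of the GWP*/TCRE methodology summarized in these snapshot descriptions (a sketch only, not part of the patch: the conversion factors and TCRE value below are placeholders rather than the IPCC AR6 best estimates the authors use, and the real GWP* treatment of short-lived gases is rate-based rather than a single constant per gas):

```python
# Hypothetical, illustrative constants - NOT the values used in the dataset.
CH4_TO_CO2EQ = 29.8  # tonnes of CO2-eq per tonne of CH4 (placeholder)
N2O_TO_CO2EQ = 273.0  # tonnes of CO2-eq per tonne of N2O (placeholder)
TCRE = 0.45e-12  # °C of GMST change per tonne of cumulative CO2-eq (placeholder)


def annual_co2eq(co2: float, ch4: float, n2o: float) -> float:
    """Combine one year of emissions (in tonnes) into a single CO2-equivalent figure."""
    return co2 + ch4 * CH4_TO_CO2EQ + n2o * N2O_TO_CO2EQ


def gmst_response(annual_co2eq_by_year: list[float]) -> float:
    """Warming is proportional to cumulative CO2-equivalent emissions (TCRE approach)."""
    return TCRE * sum(annual_co2eq_by_year)
```

The garden step earlier in this patch follows the same constant-factor pattern in `add_emissions_in_co2_equivalents`, using separate CH₄ factors for fossil and land-use emissions and a single factor for N₂O.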
+ version_producer: '2024.1' + url_main: https://zenodo.org/records/7636699/latest + url_download: https://zenodo.org/records/10839859/files/GMST_response_1851-2022.csv + date_accessed: '2024-04-08' + date_published: '2024-03-19' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/7636699/latest +outs: + - md5: e46a789f557012f78c6fb98a1816a797 + size: 28745402 + path: national_contributions_temperature_response.csv From a4f816a1a4909b14ee3e8ea0ee3700bd53f2c969 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 10 Apr 2024 12:29:34 +0200 Subject: [PATCH 05/61] :sparkles: Improve datadiff (#2494) * :sparkles: improve datadiff --- apps/backport/backport.py | 4 +- apps/backport/bulk_backport.py | 4 +- apps/metadata_migrate/cli.py | 7 +- etl/compare.py | 14 +- etl/data_helpers/population.py | 2 +- etl/datadiff.py | 335 +++++++++++++++++++------- etl/db.py | 24 +- etl/grapher_helpers.py | 6 +- etl/grapher_import.py | 8 +- etl/variable_mapping_translate.py | 5 +- lib/catalog/owid/catalog/variables.py | 30 ++- lib/repack/owid/repack/__init__.py | 5 +- lib/repack/tests/test_repack.py | 19 ++ tests/data_helpers/test_geo.py | 4 +- tests/test_datadiff.py | 46 ++-- 15 files changed, 368 insertions(+), 145 deletions(-) diff --git a/apps/backport/backport.py b/apps/backport/backport.py index bb0186581f4..b3fbe2c2a65 100644 --- a/apps/backport/backport.py +++ b/apps/backport/backport.py @@ -20,7 +20,7 @@ from etl import config, paths from etl import grapher_model as gm from etl.backport_helpers import GrapherConfig -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.files import checksum_str from etl.snapshot import Snapshot, SnapshotMeta @@ -346,7 +346,7 @@ def _load_values(engine: Engine, variable_ids: list[int]) -> pd.DataFrame: "entityCode": "entity_code", } ) - vf: pd.DataFrame = pd.read_sql(q, engine, params={"variable_ids": variable_ids}) + vf = read_sql(q, engine, params={"variable_ids": variable_ids}) df = df.merge(vf, on="variable_id") # try converting values to float if possible, this can make the data 50% smaller diff --git a/apps/backport/bulk_backport.py b/apps/backport/bulk_backport.py index c9d61ceacfe..bb10fd859f0 100644 --- a/apps/backport/bulk_backport.py +++ b/apps/backport/bulk_backport.py @@ -9,7 +9,7 @@ from sqlalchemy.engine import Engine from etl import config -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.snapshot import snapshot_catalog from etl.steps import load_dag @@ -195,7 +195,7 @@ def _active_datasets( limit %(limit)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={ diff --git a/apps/metadata_migrate/cli.py b/apps/metadata_migrate/cli.py index 48201ac4dcc..69aa7abba17 100644 --- a/apps/metadata_migrate/cli.py +++ b/apps/metadata_migrate/cli.py @@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional import click -import pandas as pd import structlog from owid.catalog import Dataset, DatasetMeta, License, Origin, Source, Table from rich import print @@ -16,7 +15,7 @@ from etl import config from etl import grapher_model as gm from etl.command import main as etl_main -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.metadata_export import merge_or_create_yaml, reorder_fields from etl.paths import BASE_DIR, DAG_FILE, DATA_DIR, STEP_DIR @@ -108,7 +107,7 @@ def cli( select config from charts where slug = '{chart_slug}' """ - df = pd.read_sql(q, engine) + df = read_sql(q, engine) if df.empty: raise ValueError(f"no chart found for slug {chart_slug}") @@ 
-359,7 +358,7 @@ def _load_grapher_config(engine: Engine, col: str, ds_meta: DatasetMeta) -> Dict d.version = '{ds_meta.version}' and d.shortName = '{ds_meta.short_name}' """ - cf = pd.read_sql(q, engine) + cf = read_sql(q, engine) if len(cf) == 0: log.warning(f"no chart found for variable {col}") return {} diff --git a/etl/compare.py b/etl/compare.py index 527ae17601f..a690224f3b3 100644 --- a/etl/compare.py +++ b/etl/compare.py @@ -17,7 +17,7 @@ from apps.backport.datasync.data_metadata import variable_data_df_from_s3 from etl import tempcompare -from etl.db import get_engine +from etl.db import get_engine, read_sql @click.group(name="compare", cls=RichGroup) @@ -293,11 +293,7 @@ def read_dataset_from_db(env_path: str, namespace: str, version: str, dataset: s WHERE version = %(version)s and namespace = %(namespace)s and shortName = %(dataset)s """ - df = pd.read_sql( - q, - engine, - params={"version": version, "namespace": namespace, "dataset": dataset}, - ) + df = read_sql(q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}) # drop uninteresting columns df = df.drop(["createdByUserId", "dataEditedAt", "metadataEditedAt", "updatedAt"], axis=1) @@ -316,7 +312,7 @@ def read_variables_from_db(env_path: str, namespace: str, version: str, dataset: WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, @@ -341,7 +337,7 @@ def read_sources_from_db(env_path: str, namespace: str, version: str, dataset: s WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, @@ -365,7 +361,7 @@ def read_values_from_s3(env_path: str, namespace: str, version: str, dataset: st JOIN datasets as d ON v.datasetId = d.id WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - vf = pd.read_sql( + vf = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, diff --git a/etl/data_helpers/population.py b/etl/data_helpers/population.py index d768aab7fa9..6b150b4033c 100644 --- a/etl/data_helpers/population.py +++ b/etl/data_helpers/population.py @@ -111,7 +111,7 @@ def add_population( # Build age groups df_pop = [] - pop["age"] = pop["age"].replace({"100+": 100}).astype("uint") + pop["age"] = pop["age"].astype(str).replace({"100+": 100}).astype("uint") for age_group_name, age_ranges in age_group_mapping.items(): if not age_ranges: age_ranges = [None, None] diff --git a/etl/datadiff.py b/etl/datadiff.py index 3fa6dc26d33..f45008a80df 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -3,7 +3,7 @@ import re from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, cast +from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast import numpy as np import pandas as pd @@ -11,7 +11,7 @@ import rich import rich_click as click import structlog -from owid.catalog import Dataset, DatasetMeta, LocalCatalog, RemoteCatalog, Table, find +from owid.catalog import Dataset, DatasetMeta, LocalCatalog, RemoteCatalog, Table, VariableMeta, find from owid.catalog.catalogs import CHANNEL, OWID_CATALOG_URI from rich.console import Console from rich.panel import Panel @@ -111,16 +111,33 @@ def _diff_tables(self, 
ds_a: Dataset, ds_b: Dataset, table_name: str): for col in ds_b[table_name].columns: self.p(f"\t\t[green]+ Column [b]{col}[/b]") else: - table_a = ds_a[table_name] - table_b = ds_b[table_name] + # get both tables in parallel + with ThreadPoolExecutor() as executor: + future_a = executor.submit(ds_a.__getitem__, table_name) + future_b = executor.submit(ds_b.__getitem__, table_name) + + table_a = future_a.result() + table_b = future_b.result() # set default index for datasets that don't have one if table_a.index.names == [None] and table_b.index.names == [None]: candidates = {"entity", "date", "country", "year"} - new_index = list(candidates & set(table_a.columns) & set(table_b.columns)) - if new_index: - table_a = table_a.set_index(new_index) - table_b = table_b.set_index(new_index) + new_index_cols = list(candidates & set(table_a.columns) & set(table_b.columns)) + if new_index_cols: + table_a = table_a.set_index(new_index_cols) + table_b = table_b.set_index(new_index_cols) + + # if using default index, it is possible that we have non-determinstic order + # try sorting by the first two columns + if ( + table_a.index.names == [None] + and table_b.index.names == [None] + and len(table_a) == len(table_b) + and table_a.index[-1] == len(table_a) - 1 + and len(table_a) <= 1000 + ): + table_a = table_a.sort_values(list(table_a.columns)).reset_index(drop=True) + table_b = table_b.sort_values(list(table_b.columns)).reset_index(drop=True) # indexes differ, reset them to make them somehow comparable if table_a.index.names != table_b.index.names: @@ -131,21 +148,19 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): # only sort index if different to avoid unnecessary sorting for huge datasets such as ghe if len(table_a) != len(table_b) or not _index_equals(table_a, table_b): - index_diff = True - table_a, table_b, eq_index = _align_tables(table_a, table_b) - - # if only index order has changed, don't report it - if eq_index.all(): - index_diff = False + table_a, table_b, eq_index, new_index, removed_index = _align_tables(table_a, table_b) else: - index_diff = False eq_index = pd.Series(True, index=table_a.index) + new_index = pd.Series(False, index=table_a.index) + removed_index = pd.Series(False, index=table_a.index) # resetting index will make comparison easier - dims = table_a.index.names + dims = [dim for dim in table_a.index.names if dim is not None] table_a: Table = table_a.reset_index() table_b: Table = table_b.reset_index() - eq_index = eq_index.reset_index(drop=True) + eq_index = cast(pd.Series, eq_index.reset_index(drop=True)) + new_index = cast(pd.Series, new_index.reset_index(drop=True)) + removed_index = cast(pd.Series, removed_index.reset_index(drop=True)) # compare table metadata diff = _dict_diff(_table_metadata_dict(table_a), _table_metadata_dict(table_b), tabs=3) @@ -157,8 +172,31 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): else: self.p(f"\t[white]= Table [b]{table_name}[/b]") + # compare index + if not eq_index.all(): + for dim in dims: + if eq_index.all(): + self.p(f"\t\t[white]= Dim [b]{dim}[/b]") + else: + self.p(f"\t\t[yellow]~ Dim [b]{dim}[/b]") + if self.verbose: + dims_without_dim = [d for d in dims if d != dim] + out = _data_diff( + table_a, + table_b, + dim, + dims_without_dim, + eq_index, + eq_index, + new_index, + removed_index, + tabs=4, + ) + if out: + self.p(out) + # compare columns - all_cols = sorted(set(table_a.columns) | set(table_b.columns)) + all_cols = sorted((set(table_a.columns) | set(table_b.columns)) - 
set(dims)) for col in all_cols: if self.cols and not re.search(self.cols, col): continue @@ -171,31 +209,33 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): col_a = table_a[col] col_b = table_b[col] + # metadata diff + meta_diff = _dict_diff( + _column_metadata_dict(col_a.metadata), _column_metadata_dict(col_b.metadata), tabs=4 + ) + # equality on index and series eq_data = series_equals(table_a[col], table_b[col]) - data_diff = (~eq_data).any() - eq = eq_index & eq_data - - col_a_meta = col_a.metadata.to_dict() - col_b_meta = col_b.metadata.to_dict() - meta_diff = _dict_diff(col_a_meta, col_b_meta, tabs=4) - - changed = ( - (["data"] if data_diff else []) - + (["metadata"] if meta_diff else []) - + (["index"] if index_diff else []) - ) + changed = [] + if meta_diff: + changed.append("changed [u]metadata[/u]") + if new_index.any(): + changed.append("new [u]data[/u]") + if (~eq_data[~new_index]).any(): + changed.append("changed [u]data[/u]") if changed: - self.p(f"\t\t[yellow]~ Column [b]{col}[/b] (changed [u]{' & '.join(changed)}[/u])") + self.p(f"\t\t[yellow]~ Column [b]{col}[/b] ({', '.join(changed)})") if self.verbose: if meta_diff: - self.p(_dict_diff(col_a_meta, col_b_meta, tabs=4)) - if data_diff or index_diff: + self.p(meta_diff) + if new_index.any() or removed_index.any() or (~eq_data).any(): if meta_diff: self.p("") - out = _data_diff(table_a, table_b, col, dims, tabs=4, eq=eq) + out = _data_diff( + table_a, table_b, col, dims, eq_data, eq_index, new_index, removed_index, tabs=4 + ) if out: self.p(out) else: @@ -279,6 +319,13 @@ def __getitem__(self, name: str) -> Table: is_flag=True, help="Print code snippet for loading both tables, useful for debugging in notebook", ) +@click.option( + "--workers", + "-w", + type=int, + help="Use multiple threads.", + default=1, +) def cli( path_a: str, path_b: str, @@ -288,11 +335,14 @@ def cli( exclude: Optional[str], verbose: bool, snippet: bool, + workers: int, ) -> None: """Compare all datasets from two catalogs and print out a summary of their differences. Compare all the datasets from catalog in `PATH_A` with all the datasets in catalog `PATH_B`. The catalog paths link to the `data/` folder with all the datasets (it contains a `catalog.meta.json` file) + You can also use a path to a dataset. + Note that you can use the keyword "REMOTE" as the path, if you want to run a comparison with the remote catalog. This tool is useful as a quick way to see what has changed in the catalog and whether our updates don't have any unexpected side effects. 
@@ -320,14 +370,24 @@ def cli( path_to_ds_a = _load_catalog_datasets(path_a, channel, include, exclude) path_to_ds_b = _load_catalog_datasets(path_b, channel, include, exclude) - # only keep datasets in DAG + # only keep datasets in DAG, unless there's only one dataset selected by precise path dag_steps = {s.split("://")[1] for s in load_dag().keys()} - path_to_ds_a = {k: v for k, v in path_to_ds_a.items() if k in dag_steps} - path_to_ds_b = {k: v for k, v in path_to_ds_b.items() if k in dag_steps} + if len(path_to_ds_a) > 1: + path_to_ds_a = {k: v for k, v in path_to_ds_a.items() if k in dag_steps} + if len(path_to_ds_b) > 1: + path_to_ds_b = {k: v for k, v in path_to_ds_b.items() if k in dag_steps} + + if not path_to_ds_a: + console.print(f"[yellow]❓ No datasets found in {path_a}[/yellow]") + exit(0) + if not path_to_ds_b: + console.print(f"[yellow]❓ No datasets found in {path_b}[/yellow]") + exit(0) any_diff = False any_error = False + matched_datasets = [] for path in sorted(set(path_to_ds_a.keys()) | set(path_to_ds_b.keys())): ds_a = _match_dataset(path_to_ds_a, path) ds_b = _match_dataset(path_to_ds_b, path) @@ -337,27 +397,65 @@ def cli( # to improve performance. Source checksum should be enough continue - lines = [] + matched_datasets.append((ds_a, ds_b)) - def _append_and_print(x): - lines.append(x) - console.print(x) + if workers > 1: + futures = [] - try: - differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) - differ.summary() - except DatasetError as e: - # soft fail and continue with another dataset - _append_and_print(f"[bold red]⚠ Error: {e}[/bold red]") - continue - except Exception as e: - # soft fail and continue with another dataset - log.error(e, exc_info=True) - any_error = True - continue + with ThreadPoolExecutor(max_workers=workers) as executor: + for ds_a, ds_b in matched_datasets: + + def func(ds_a, ds_b): + lines = [] + differ = DatasetDiff( + ds_a, ds_b, cols=cols, print=lambda x: lines.append(x), verbose=verbose, snippet=snippet + ) + differ.summary() + return lines + + futures.append(executor.submit(func, ds_a, ds_b)) + + for future in futures: + try: + lines = future.result() + except DatasetError as e: + # soft fail and continue with another dataset + lines = [f"[bold red]⚠ Error: {e}[/bold red]"] + except Exception as e: + # soft fail and continue with another dataset + log.error(e, exc_info=True) + any_error = True + lines = [] + continue - if any("~" in line for line in lines if isinstance(line, str)): - any_diff = True + for line in lines: + console.print(line) + + if "~" in line: + any_diff = True + else: + for ds_a, ds_b in matched_datasets: + lines = [] + + def _append_and_print(x): + lines.append(x) + console.print(x) + + try: + differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) + differ.summary() + except DatasetError as e: + # soft fail and continue with another dataset + _append_and_print(f"[bold red]⚠ Error: {e}[/bold red]") + continue + except Exception as e: + # soft fail and continue with another dataset + log.error(e, exc_info=True) + any_error = True + continue + + if any("~" in line for line in lines if isinstance(line, str)): + any_diff = True console.print() if not path_to_ds_a and not path_to_ds_b: @@ -388,8 +486,8 @@ def _index_equals(table_a: pd.DataFrame, table_b: pd.DataFrame, sample: int = 10 index_a = table_a.index index_b = table_b.index else: - index_a = table_a.sample(sample, random_state=0).index - index_b = 
table_b.sample(sample, random_state=0).index + index_a = table_a.sample(sample, random_state=0, replace=True).index + index_b = table_b.sample(sample, random_state=0, replace=True).index return index_a.equals(index_b) @@ -413,23 +511,82 @@ def _dict_diff(dict_a: Dict[str, Any], dict_b: Dict[str, Any], tabs: int = 0, ** return "\t" * tabs + "".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() +def _df_to_str(df: pd.DataFrame, limit: int = 5) -> list[str]: + lines = [] + if len(df) > limit: + df_samp = df.sample(limit, random_state=0).sort_index() + else: + df_samp = df + + for line in df_samp.to_string(index=False).split("\n"): # type: ignore + lines.append(" " + line) + return lines + + def _data_diff( - table_a: Table, table_b: Table, col: str, dims: list[str], tabs: int, eq: Optional[pd.Series] = None + table_a: Table, + table_b: Table, + col: str, + dims: list[str], + eq_data: pd.Series, + eq_index: pd.Series, + new_index: pd.Series, + removed_index: pd.Series, + tabs: int = 0, ) -> str: """Return summary of data differences.""" - if eq is None: - eq = series_equals(table_a[col], table_b[col]) + # eq = eq_data & eq_index + n = (eq_index | new_index).sum() - lines = [ - f"- Changed values: {(~eq).sum()} / {len(eq)} ({(~eq).sum() / len(eq) * 100:.2f}%)", - ] + lines = [] + + cols = [d for d in dims if d is not None] + [col] + + # new values + if new_index.any(): + lines.append( + f"+ New values: {new_index.sum()} / {n} ({new_index.sum() / n * 100:.2f}%)", + ) + lines += _df_to_str(table_b.loc[new_index, cols]) + + # removed values + if removed_index.any(): + lines.append( + f"- Removed values: {removed_index.sum()} / {n} ({removed_index.sum() / n * 100:.2f}%)", + ) + lines += _df_to_str(table_a.loc[removed_index, cols]) + + # changed values + neq = ~eq_data & eq_index + if neq.any(): + lines.append( + f"~ Changed values: {neq.sum()} / {n} ({neq.sum() / n * 100:.2f}%)", + ) + samp_a = table_a.loc[neq, cols] + samp_b = table_b.loc[neq, cols] + both = samp_a.merge(samp_b, on=dims, suffixes=(" -", " +")) + lines += _df_to_str(both) + # add color + lines = ["[violet]" + line for line in lines] + + if not lines: + return "" + else: + # add tabs + return "\t" * tabs + "\n".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() + + """OLD CODE, PARTS OF IT COULD BE STILL USEFUL # changes in index for dim in dims: if dim is not None: diff_elements = table_a.loc[~eq, dim].dropna().astype(str).sort_values().unique().tolist() detail = f"{len(diff_elements)} affected" if len(diff_elements) > 5 else ", ".join(diff_elements) - lines.append(f"- {dim}: {detail}") + lines.append(f"- Dim `{dim}`: {detail}") + + lines.append( + f"- Changed values: {(~eq).sum()} / {len(eq)} ({(~eq).sum() / len(eq) * 100:.2f}%)", + ) # changes in values if ( @@ -452,15 +609,7 @@ def _data_diff( rel_diff = abs_diff / mean if not pd.isnull(mean) and mean != 0 else np.nan lines.append(f"- Avg. 
change: {abs_diff:.2f} ({rel_diff:.0%})") - - # add color - lines = ["[violet]" + line for line in lines] - - if not lines: - return "" - else: - # add tabs - return "\t" * tabs + "\n".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() + """ def _is_datetime(dtype: Any) -> bool: @@ -470,7 +619,7 @@ def _is_datetime(dtype: Any) -> bool: return False -def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Series]: +def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Series, pd.Series, pd.Series]: if not table_a.index.is_unique or not table_b.index.is_unique: raise DatasetError("Index must be unique.") @@ -488,11 +637,14 @@ def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Seri table_b["_x"] = 1 table_a, table_b = table_a.align(table_b, join="outer", copy=False) - eq_index = table_a["_x"].notnull() & table_b["_x"].notnull() + new_index = table_a["_x"].isnull() + removed_index = table_b["_x"].isnull() + + eq_index = ~(new_index | removed_index) table_a.drop(columns="_x", inplace=True) table_b.drop(columns="_x", inplace=True) - return cast(Table, table_a), cast(Table, table_b), eq_index + return cast(Table, table_a), cast(Table, table_b), eq_index, new_index, removed_index def _sort_index(df: Table) -> Table: @@ -554,10 +706,20 @@ def _table_metadata_dict(tab: Table) -> Dict[str, Any]: # for col in tab.columns: # d["columns"][col] = tab[col].metadata.to_dict() + # sort primary key + if "primary_key" in d: + d["primary_key"] = sorted(d["primary_key"]) + del d["dataset"] return d +def _column_metadata_dict(meta: VariableMeta) -> Dict[str, Any]: + d = meta.to_dict() + d.pop("processing_log", None) + return d + + def _dataset_metadata_dict(ds: Dataset) -> Dict[str, Any]: """Extract metadata from Dataset object, prune and and return it as a dictionary""" d = ds.metadata.to_dict() @@ -571,10 +733,21 @@ def _dataset_metadata_dict(ds: Dataset) -> Dict[str, Any]: def _local_catalog_datasets( - catalog_path: str, channels: Iterable[CHANNEL], include: Optional[str], exclude: Optional[str] + catalog_path: Union[str, Path], channels: Iterable[CHANNEL], include: Optional[str], exclude: Optional[str] ) -> Dict[str, Dataset]: """Return a mapping from dataset path to Dataset object of local catalog.""" - lc_a = LocalCatalog(catalog_path, channels=channels) + catalog_path = Path(catalog_path) + catalog_dir = catalog_path + + # it is possible to use subset of a data catalog + while not (catalog_dir / "catalog.meta.json").exists() and catalog_dir != catalog_dir.parent: + catalog_dir = catalog_dir.parent + + if catalog_dir != catalog_path: + assert include is None, "Include pattern is not supported for subset of a catalog" + include = str(catalog_path.relative_to(catalog_dir)) + + lc_a = LocalCatalog(catalog_dir, channels=channels) datasets = [] for chan in lc_a.channels: channel_datasets = list(lc_a.iter_datasets(chan, include=include)) @@ -585,7 +758,7 @@ def _local_catalog_datasets( datasets += channel_datasets # keep only relative path of dataset - mapping = {str(Path(ds.path).relative_to(catalog_path)): ds for ds in datasets} + mapping = {str(Path(ds.path).relative_to(catalog_dir)): ds for ds in datasets} if exclude: re_exclude = re.compile(exclude) @@ -619,10 +792,10 @@ def _remote_catalog_datasets(channels: Iterable[CHANNEL], include: str, exclude: ds_paths = frame["ds_paths"] if include: - ds_paths = ds_paths[ds_paths.str.contains(include)] + ds_paths = ds_paths[ds_paths.str.contains(include, regex=True)] if exclude: - ds_paths = 
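For reference, the sentinel-column alignment that `_align_tables` above relies on (add a `_x` marker column to both tables, outer-align on the index, and read off which side turned null) can be reproduced on two toy frames; this is a minimal sketch, not part of the patch:

```python
import pandas as pd

a = pd.DataFrame({"val": [1, 2]}, index=["UK", "US"])
b = pd.DataFrame({"val": [1, 3]}, index=["UK", "FR"])

# Mark every row of each frame, then outer-align both frames on their index.
a["_x"] = 1
b["_x"] = 1
a, b = a.align(b, join="outer")

new_index = a["_x"].isnull()  # True only for FR: present in b but missing in a
removed_index = b["_x"].isnull()  # True only for US: present in a but missing in b
eq_index = ~(new_index | removed_index)  # True only for UK: present in both
```

These are the masks that feed the "New values", "Removed values" and "Changed values" sections printed by `_data_diff`.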
ds_paths[~ds_paths.str.contains(exclude)] + ds_paths = ds_paths[~ds_paths.str.contains(exclude, regex=True)] ds_paths = set(ds_paths) diff --git a/etl/db.py b/etl/db.py index e2a7a9b9fa8..dcd13ba0e3c 100644 --- a/etl/db.py +++ b/etl/db.py @@ -2,7 +2,7 @@ import warnings from collections.abc import Generator from contextlib import contextmanager -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, cast from urllib.parse import quote import MySQLdb @@ -49,10 +49,13 @@ def get_session(**kwargs) -> Session: def get_engine(conf: Optional[Dict[str, Any]] = None) -> Engine: cf: Any = dict_to_object(conf) if conf else config - return create_engine( - f"mysql://{cf.DB_USER}:{quote(cf.DB_PASS)}@{cf.DB_HOST}:{cf.DB_PORT}/{cf.DB_NAME}", - pool_size=30, # Increase the pool size to allow higher GRAPHER_WORKERS - max_overflow=30, # Increase the max overflow limit to allow higher GRAPHER_WORKERS + return cast( + Engine, + create_engine( + f"mysql://{cf.DB_USER}:{quote(cf.DB_PASS)}@{cf.DB_HOST}:{cf.DB_PORT}/{cf.DB_NAME}", + pool_size=30, # Increase the pool size to allow higher GRAPHER_WORKERS + max_overflow=30, # Increase the max overflow limit to allow higher GRAPHER_WORKERS + ), ) @@ -459,3 +462,14 @@ def get_info_for_etl_datasets(db_conn: Optional[MySQLdb.Connection] = None) -> p df.loc[df["is_private"], "step"] = df[df["is_private"]]["step"].str.replace("data://", "data-private://") return df + + +def read_sql(sql: str, engine: Optional[Engine] = None, *args, **kwargs) -> pd.DataFrame: + """Wrapper around pd.read_sql that creates a connection and closes it after reading the data. + This adds overhead, so if you need performance, reuse the same connection and cursor. + """ + engine = engine or get_engine() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + with engine.connect() as con: + return pd.read_sql(sql, con.connection, *args, **kwargs) diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py index a6bc869bc35..064ca1401ad 100644 --- a/etl/grapher_helpers.py +++ b/etl/grapher_helpers.py @@ -13,7 +13,7 @@ from owid import catalog from owid.catalog.utils import underscore -from etl.db import get_connection, get_engine +from etl.db import get_connection, read_sql from etl.db_utils import DBUtils from etl.files import checksum_str @@ -303,7 +303,7 @@ def long_to_wide_tables( def _get_entities_from_db(countries: Set[str], by: Literal["name", "code"]) -> Dict[str, int]: q = f"select id as entity_id, {by} from entities where {by} in %(names)s" - df = pd.read_sql(q, get_engine(), params={"names": list(countries)}) + df = read_sql(q, params={"names": list(countries)}) return cast(Dict[str, int], df.set_index(by).entity_id.to_dict()) @@ -498,6 +498,8 @@ def _adapt_table_for_grapher( assert {"year", country_col} <= set(table.columns), f"Table must have columns {country_col} and year." assert "entity_id" not in table.columns, "Table must not have column entity_id." + table[country_col] = table[country_col].astype(str) + # Grapher needs a column entity id, that is constructed based on the unique entity names in the database. table["entity_id"] = country_to_entity_id(table[country_col], create_entities=True) table = table.drop(columns=[country_col]).rename(columns={year_col: "year"}) diff --git a/etl/grapher_import.py b/etl/grapher_import.py index 97533efc383..346b42a5043 100644 --- a/etl/grapher_import.py +++ b/etl/grapher_import.py @@ -213,10 +213,12 @@ def upsert_table( "Tables to be upserted must have no null values. 
Instead they" f" have:\n{table.loc[table.iloc[:, 0].isnull()]}" ) table = table.reorder_levels(["year", "entity_id"]) - assert table.index.dtypes[0] in gh.INT_TYPES, f"year must be of an integer type but was: {table.index.dtypes[0]}" assert ( - table.index.dtypes[1] in gh.INT_TYPES - ), f"entity_id must be of an integer type but was: {table.index.dtypes[1]}" + table.index.dtypes.iloc[0] in gh.INT_TYPES + ), f"year must be of an integer type but was: {table.index.dtypes.iloc[0]}" + assert ( + table.index.dtypes.iloc[1] in gh.INT_TYPES + ), f"entity_id must be of an integer type but was: {table.index.dtypes.iloc[1]}" utils.validate_underscore(table.metadata.short_name, "Table's short_name") utils.validate_underscore(table.columns[0], "Variable's name") diff --git a/etl/variable_mapping_translate.py b/etl/variable_mapping_translate.py index e7d49632225..69e9a187be2 100644 --- a/etl/variable_mapping_translate.py +++ b/etl/variable_mapping_translate.py @@ -9,6 +9,8 @@ from sqlalchemy import create_engine from sqlalchemy.engine.base import Engine +from etl.db import read_sql + log = structlog.get_logger() @@ -191,8 +193,7 @@ def _run_query_mapping_to_df(sql: Engine, variable_ids: Tuple[str, ...]) -> pd.D left join datasets on variables.datasetId=datasets.id where variables.id in %(variable_ids)s; """ - df: pd.DataFrame = pd.read_sql_query(query, sql, params={"variable_ids": variable_ids}) - return df + return read_sql(query, sql, params={"variable_ids": variable_ids}) def _build_dfs(sql: Engine, mapping: Dict[str, str]) -> Tuple[pd.DataFrame, pd.DataFrame]: diff --git a/lib/catalog/owid/catalog/variables.py b/lib/catalog/owid/catalog/variables.py index 565a9dbae85..addc4d02d01 100644 --- a/lib/catalog/owid/catalog/variables.py +++ b/lib/catalog/owid/catalog/variables.py @@ -367,31 +367,35 @@ def _get_metadata_value_from_variables_if_all_identical( def get_unique_sources_from_variables(variables: List[Variable]) -> List[Source]: # Make a list of all sources of all variables. - sources = sum([variable.metadata.sources for variable in variables], []) - - return pd.unique(sources).tolist() + sources = [] + for variable in variables: + sources += [s for s in variable.metadata.sources if s not in sources] + return sources def get_unique_origins_from_variables(variables: List[Variable]) -> List[Origin]: # Make a list of all origins of all variables. - origins = sum([variable.metadata.origins for variable in variables], []) - - # Get unique array of tuples of origin fields (respecting the order). - return pd.unique(origins).tolist() + origins = [] + for variable in variables: + # Get unique array of tuples of origin fields (respecting the order). + origins += [o for o in variable.metadata.origins if o not in origins] + return origins def get_unique_licenses_from_variables(variables: List[Variable]) -> List[License]: # Make a list of all licenses of all variables. - licenses = sum([variable.metadata.licenses for variable in variables], []) - - return pd.unique(licenses).tolist() + licenses = [] + for variable in variables: + licenses += [license for license in variable.metadata.licenses if license not in licenses] + return licenses def get_unique_description_key_points_from_variables(variables: List[Variable]) -> List[str]: # Make a list of all description key points of all variables. 
- description_key_points = sum([variable.metadata.description_key for variable in variables], []) - - return pd.unique(description_key_points).tolist() + description_key_points = [] + for variable in variables: + description_key_points += [k for k in variable.metadata.description_key if k not in description_key_points] + return description_key_points def combine_variables_processing_logs(variables: List[Variable]) -> ProcessingLog: diff --git a/lib/repack/owid/repack/__init__.py b/lib/repack/owid/repack/__init__.py index 32fbe1c63b6..33017514990 100644 --- a/lib/repack/owid/repack/__init__.py +++ b/lib/repack/owid/repack/__init__.py @@ -1,3 +1,4 @@ +import datetime as dt from typing import Any, Dict, List, Optional, cast import numpy as np @@ -65,7 +66,7 @@ def repack_series(s: pd.Series) -> pd.Series: for strategy in [to_int, to_float, to_category]: try: return strategy(s) - except (ValueError, TypeError): + except (ValueError, TypeError, OverflowError): continue return s @@ -126,7 +127,7 @@ def to_float(s: pd.Series) -> pd.Series: def to_category(s: pd.Series) -> pd.Series: types = set(s.dropna().apply(type).unique()) - if types.difference({str, type(None)}): + if types.difference({str, np.str_, dt.datetime, dt.date, type(None)}): raise ValueError() return s.astype("category") diff --git a/lib/repack/tests/test_repack.py b/lib/repack/tests/test_repack.py index 4a666a29ce7..e9596d8b05e 100644 --- a/lib/repack/tests/test_repack.py +++ b/lib/repack/tests/test_repack.py @@ -1,3 +1,4 @@ +import datetime as dt from typing import Any import numpy as np @@ -226,3 +227,21 @@ def test_series_eq(): a = pd.Series([1, np.nan], dtype="float64") b = pd.Series([1, np.nan], dtype="float64") assert repack.series_eq(a, b, cast=float) + + +def test_repack_object_np_str(): + s = pd.Series(["a", np.str_("b")], dtype=object) + v = repack.repack_series(s) + assert v.dtype.name == "category" + + +def test_repack_with_inf(): + s = pd.Series([0, np.inf], dtype=object) + v = repack.repack_series(s) + assert v.dtype.name == "float32" + + +def test_repack_with_datetime(): + s = pd.Series([dt.datetime.today(), dt.date.today()], dtype=object) + v = repack.repack_series(s) + assert v.dtype.name == "category" diff --git a/tests/data_helpers/test_geo.py b/tests/data_helpers/test_geo.py index adcf52d5a4a..b763f908018 100644 --- a/tests/data_helpers/test_geo.py +++ b/tests/data_helpers/test_geo.py @@ -652,9 +652,7 @@ def test_replace_region_with_one_mandatory_country_having_nan(self): df_in = self.df_in.copy() # Add NaN value for Country 2 - df_in = df_in.append( - {"country": "Country 2", "year": 2021, "var_01": np.nan, "var_02": np.nan}, ignore_index=True - ) + df_in.loc[len(df_in)] = {"country": "Country 2", "year": 2021, "var_01": np.nan, "var_02": np.nan} df = geo.add_region_aggregates( df=df_in, diff --git a/tests/test_datadiff.py b/tests/test_datadiff.py index d5f370b0083..ffeac301229 100644 --- a/tests/test_datadiff.py +++ b/tests/test_datadiff.py @@ -1,10 +1,10 @@ import pandas as pd from owid.catalog import Dataset, DatasetMeta, Table -from etl.datadiff import DatasetDiff, _data_diff +from etl.datadiff import DatasetDiff -def test_DatasetDiff_summary(tmp_path): +def _create_datasets(tmp_path): (tmp_path / "catalog_a").mkdir() (tmp_path / "catalog_b").mkdir() @@ -16,6 +16,12 @@ def test_DatasetDiff_summary(tmp_path): ds_b = Dataset.create_empty(tmp_path / "catalog_b" / "ds", ds_meta_b) ds_b.metadata.channel = "garden" # type: ignore + return ds_a, ds_b + + +def test_DatasetDiff_summary(tmp_path): + ds_a, ds_b = 
_create_datasets(tmp_path) + tab_a = Table(pd.DataFrame({"a": [1, 2]}), short_name="tab") tab_a.metadata.description = "tab" @@ -32,21 +38,29 @@ def test_DatasetDiff_summary(tmp_path): assert out == [ "[white]= Dataset [b]garden/n/v/ds[/b]", "\t[yellow]~ Table [b]tab[/b] (changed [u]metadata[/u])", - "\t\t[yellow]~ Column [b]a[/b] (changed [u]data & metadata[/u])", + "\t\t[yellow]~ Column [b]a[/b] (changed [u]metadata[/u], changed [u]data[/u])", "\t\t[green]+ Column [b]b[/b]", ] -def test_data_diff(): - table_a = Table({"country": ["UK", "US"], "a": [1, 2]}) - table_b = Table({"country": ["UK", "US"], "a": [1, 3]}) - out = _data_diff(table_a, table_b, col="a", dims=["country"], tabs=0) - print(out) - assert ( - out - == """ -[violet]- Changed values: 1 / 2 (50.00%) -[violet]- country: US -[violet]- Avg. change: 1.00 (40%) - """.strip() - ) +def test_new_data(tmp_path): + ds_a, ds_b = _create_datasets(tmp_path) + + tab_a = Table({"country": ["UK", "US"], "a": [1, 3]}, short_name="tab") + tab_b = Table({"country": ["UK", "US", "FR"], "a": [1, 2, 3]}, short_name="tab") + + ds_a.add(tab_a) + ds_b.add(tab_b) + + out = [] + differ = DatasetDiff(ds_a, ds_b, print=lambda x: out.append(x), verbose=True) + differ.summary() + + assert out == [ + "[white]= Dataset [b]garden/n/v/ds[/b]", + "\t[white]= Table [b]tab[/b]", + "\t\t[yellow]~ Dim [b]country[/b]", + "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country\n\t\t\t\t[violet] FR", + "\t\t[yellow]~ Column [b]a[/b] (new [u]data[/u], changed [u]data[/u])", + "\t\t\t\t[violet]+ New values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a\n\t\t\t\t[violet] FR 3\n\t\t\t\t[violet]~ Changed values: 1 / 3 (33.33%)\n\t\t\t\t[violet] country a - a +\n\t\t\t\t[violet] US 3.0 2", + ] From 0c2c2a9573572cb2f1e7ec10bb2dd8f34ca1c0b3 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 10 Apr 2024 14:13:57 +0200 Subject: [PATCH 06/61] :tada: owidbot posts results of `etl diff` to pull requests (#2498) * :tada: post datadiff to PR with owidbot --- apps/owidbot/__init__.py | 0 apps/owidbot/etldiff.py | 153 ++++++++++++++++++++++++++++++++++++ etl/config.py | 2 + etl/datadiff.py | 8 +- poetry.lock | 163 ++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 6 files changed, 324 insertions(+), 3 deletions(-) create mode 100644 apps/owidbot/__init__.py create mode 100644 apps/owidbot/etldiff.py diff --git a/apps/owidbot/__init__.py b/apps/owidbot/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py new file mode 100644 index 00000000000..e72652b5fce --- /dev/null +++ b/apps/owidbot/etldiff.py @@ -0,0 +1,153 @@ +import datetime as dt +import subprocess +from typing import Tuple + +import click +import structlog +from github import Auth, Github +from rich import print +from rich.ansi import AnsiDecoder +from rich_click.rich_command import RichCommand + +from etl import config +from etl.paths import BASE_DIR + +log = structlog.get_logger() + + +EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet" + + +@click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__) +@click.option( + "--branch", + type=str, +) +@click.option( + "--dry-run/--no-dry-run", + default=False, + type=bool, + help="Print to console, do not post to Github.", +) +def cli( + branch: str, + dry_run: bool, +) -> None: + """Post result of `etl diff` to Github PR. 
+ + Example: + + ``` + $ python apps/owidbot/etldiff.py --branch my-branch + ``` + """ + lines = call_etl_diff() + diff, result = format_etl_diff(lines) + + body = f""" +
+<details>
+<summary>etl diff: {result}</summary>
+
+```diff
+{diff}
+```
+
+Automatically updated datasets matching _{EXCLUDE_DATASETS}_ are not included
+
+</details>
+ +_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_ + """.strip() + + if dry_run: + print(body) + else: + post_comment_to_pr(branch, body) + + +def post_comment_to_pr(branch_name: str, body: str) -> None: + assert config.OWIDBOT_ACCESS_TOKEN + auth = Auth.Token(config.OWIDBOT_ACCESS_TOKEN) + g = Github(auth=auth) + + repo = g.get_repo("owid/etl") + + # Find pull requests for the branch (assuming you're looking for open PRs) + pulls = repo.get_pulls(state="open", sort="created", head=f"{repo.owner.login}:{branch_name}") + pulls = list(pulls) + + if len(pulls) == 0: + raise AssertionError(f"No open PR found for branch {branch_name}") + elif len(pulls) > 1: + raise AssertionError(f"More than one open PR found for branch {branch_name}") + + pr = pulls[0] + + comments = pr.get_issue_comments() + + owidbot_comments = [comment for comment in comments if comment.user.login == "owidbot"] + + if len(owidbot_comments) == 0: + pr.create_issue_comment(body=body) + elif len(owidbot_comments) == 1: + owidbot_comment = owidbot_comments[0] + owidbot_comment.edit(body=body) + else: + raise AssertionError("More than one owidbot comment found.") + + +def format_etl_diff(lines: list[str]) -> Tuple[str, str]: + new_lines = [] + result = "" + for line in lines: + # extract result + if line and line[0] in ("✅", "❌", "⚠️", "❓"): + result = line + continue + + # skip some lines + if "this may get slow" in line or "comparison with compare" in line: + continue + + if line.strip().startswith("-"): + line = "-" + line[1:] + if line.strip().startswith("+"): + line = "+" + line[1:] + + new_lines.append(line) + + diff = "\n".join(new_lines) + return diff, result + + +def call_etl_diff() -> list[str]: + cmd = [ + "poetry", + "run", + "etl", + "diff", + "REMOTE", + "data/", + "--include", + "garden", + "--exclude", + EXCLUDE_DATASETS, + "--verbose", + "--workers", + "3", + ] + + result = subprocess.Popen(cmd, cwd=BASE_DIR, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = result.communicate() + + stdout = stdout.decode() + stderr = stderr.decode() + + if stderr: + raise Exception(f"Error: {stderr}") + + return [str(line) for line in AnsiDecoder().decode(stdout)] + + +if __name__ == "__main__": + cli() diff --git a/etl/config.py b/etl/config.py index 528ba5adc15..5005ea5cef7 100644 --- a/etl/config.py +++ b/etl/config.py @@ -150,6 +150,8 @@ def variable_metadata_url(variable_id): OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) +OWIDBOT_ACCESS_TOKEN = env.get("OWIDBOT_ACCESS_TOKEN", None) + def enable_bugsnag() -> None: if BUGSNAG_API_KEY: diff --git a/etl/datadiff.py b/etl/datadiff.py index f45008a80df..b7eaa5e97f9 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -1,6 +1,7 @@ import difflib import os import re +import traceback from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast @@ -365,7 +366,7 @@ def cli( $ etl diff other-data/ data/ --include maddison ``` """ - console = Console(tab_size=2) + console = Console(tab_size=2, soft_wrap=True) path_to_ds_a = _load_catalog_datasets(path_a, channel, include, exclude) path_to_ds_b = _load_catalog_datasets(path_b, channel, include, exclude) @@ -423,7 +424,7 @@ def func(ds_a, ds_b): lines = [f"[bold red]⚠ Error: {e}[/bold red]"] except Exception as e: # soft fail and continue with another dataset - log.error(e, exc_info=True) + log.error("\n".join(traceback.format_exception(type(e), e, e.__traceback__))) any_error = True lines = [] 
continue @@ -757,6 +758,9 @@ def _local_catalog_datasets( datasets += channel_datasets + # only compare public datasets + datasets = [ds for ds in datasets if ds.is_public] + # keep only relative path of dataset mapping = {str(Path(ds.path).relative_to(catalog_dir)): ds for ds in datasets} diff --git a/poetry.lock b/poetry.lock index b25db6638b6..7caf0a94fcf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1262,6 +1262,23 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "deprecated" +version = "1.2.14" +description = "Python @deprecated decorator to deprecate old python classes, functions or methods." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, + {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, +] + +[package.dependencies] +wrapt = ">=1.10,<2" + +[package.extras] +dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] + [[package]] name = "distro" version = "1.8.0" @@ -4458,6 +4475,25 @@ PyYAML = ">=3.0" fsspec = ["appdirs (>=1.4.3)", "fsspec (>=2021.07.0)", "funcy (>=1.14)", "tqdm (>=4.0.0)"] tests = ["black (==23.3.0)", "flake8", "flake8-docstrings", "funcy (>=1.14)", "importlib-resources (<6)", "pyinstaller", "pytest (>=4.6.0)", "pytest-mock", "timeout-decorator"] +[[package]] +name = "pygithub" +version = "2.3.0" +description = "Use the full Github API v3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyGithub-2.3.0-py3-none-any.whl", hash = "sha256:65b499728be3ce7b0cd2cd760da3b32f0f4d7bc55e5e0677617f90f6564e793e"}, + {file = "PyGithub-2.3.0.tar.gz", hash = "sha256:0148d7347a1cdeed99af905077010aef81a4dad988b0ba51d4108bf66b443f7e"}, +] + +[package.dependencies] +Deprecated = "*" +pyjwt = {version = ">=2.4.0", extras = ["crypto"]} +pynacl = ">=1.4.0" +requests = ">=2.14.0" +typing-extensions = ">=4.0.0" +urllib3 = ">=1.26.0" + [[package]] name = "pygments" version = "2.16.1" @@ -4483,6 +4519,26 @@ files = [ {file = "pyhumps-3.8.0.tar.gz", hash = "sha256:498026258f7ee1a8e447c2e28526c0bea9407f9a59c03260aee4bd6c04d681a3"}, ] +[[package]] +name = "pyjwt" +version = "2.8.0" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, + {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, +] + +[package.dependencies] +cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""} + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + [[package]] name = "pymdown-extensions" version = "10.3" @@ -4501,6 +4557,32 @@ pyyaml = "*" [package.extras] extra = ["pygments (>=2.12)"] +[[package]] +name = "pynacl" +version = "1.5.0" +description = "Python binding to the Networking and Cryptography (NaCl) library" +optional = false +python-versions = ">=3.6" 
+files = [ + {file = "PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a36d4a9dda1f19ce6e03c9a784a2921a4b726b02e1c736600ca9c22029474394"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0c84947a22519e013607c9be43706dd42513f9e6ae5d39d3613ca1e142fba44d"}, + {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858"}, + {file = "PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b"}, + {file = "PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:61f642bf2378713e2c2e1de73444a3778e5f0a38be6fee0fe532fe30060282ff"}, + {file = "PyNaCl-1.5.0-cp36-abi3-win32.whl", hash = "sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543"}, + {file = "PyNaCl-1.5.0-cp36-abi3-win_amd64.whl", hash = "sha256:20f42270d27e1b6a29f54032090b972d97f0a1b0948cc52392041ef7831fee93"}, + {file = "PyNaCl-1.5.0.tar.gz", hash = "sha256:8ac7448f09ab85811607bdd21ec2464495ac8b7c66d146bf545b0f08fb9220ba"}, +] + +[package.dependencies] +cffi = ">=1.4.1" + +[package.extras] +docs = ["sphinx (>=1.6.5)", "sphinx-rtd-theme"] +tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] + [[package]] name = "pyopenssl" version = "23.2.0" @@ -7388,6 +7470,85 @@ cachetools = "*" pandas = "*" requests = "*" +[[package]] +name = "wrapt" +version = "1.16.0" +description = "Module for decorators, wrappers and monkey patching." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = 
"wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = 
"wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", 
hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, +] + [[package]] name = "wsproto" version = "1.2.0" @@ -7445,4 +7606,4 @@ test = ["pytest", "pytest-cov"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.12" -content-hash = "19fe7f0b8f32bcf844488bfe75e1f6302a7e4278bb1b461f45bec30efa2efd6a" +content-hash = "6604150041608aef717982c0ecfd530d14a1f5dec4c0d2bb0582124ca90fce20" diff --git a/pyproject.toml b/pyproject.toml index 5431762f90f..6e67f75dced 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,6 +70,7 @@ cdsapi = "^0.6.1" rioxarray = "^0.15.1" tiktoken = "^0.6.0" html2text = "^2020.1.16" +pygithub = "^2.3.0" [tool.poetry.group.api.dependencies] fastapi = "^0.109.0" From 11bf83bd19e14fcba3b153af58a63bc6b1b423c8 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Wed, 10 Apr 2024 17:20:25 +0200 Subject: [PATCH 07/61] =?UTF-8?q?=F0=9F=90=9B=20set=20first=20year=20to=20?= =?UTF-8?q?baseline=20of=20=E2=80=931=20for=20all=20benchmarks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set the first year's performance to the baseline of –1 for each benchmark. This is to preserve a baseline for –1 for all benchmarks, even when a second, better performance is recorded in a later year. --- .../garden/artificial_intelligence/2024-04-02/dynabench.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py index 208d80198a5..c0d48e24e02 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py @@ -24,6 +24,12 @@ def run(dest_dir: str) -> None: # Selecting the best performance for each benchmark per year tb = tb.groupby(["benchmark", "year"])["performance"].max().reset_index().copy_metadata(from_table=tb) + # Set the first year's performance to the baseline of –1 for each benchmark. + # This is to preserve a baseline for –1 for all benchmarks, + # even when a second, better performance is recorded in a later year. + tb = tb.sort_values(by=["benchmark", "year"]) + tb.loc[tb.groupby("benchmark").head(1).index, "performance"] = -1 + mapping = { "MNIST": "Handwriting recognition", "Switchboard": "Speech recognition", From 08350d2326f8861aa5a4108abcc8ff3230c6a123 Mon Sep 17 00:00:00 2001 From: Edouard Mathieu Date: Wed, 10 Apr 2024 17:31:48 +0200 Subject: [PATCH 08/61] =?UTF-8?q?=E2=9C=A8=20dynabench:=20disable=20downlo?= =?UTF-8?q?ad=20button=20for=20chart?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given the transformation we apply in 11bf83bd19e14fcba3b153af58a63bc6b1b423c8, Max and I think it's wiser to disable the download feature for this chart, as the downloadable data doesn't reflect the provider's data. 
--- .../garden/artificial_intelligence/2024-04-02/dynabench.meta.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml index 33ca7d1a1ad..89456c2164a 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml +++ b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml @@ -27,6 +27,7 @@ definitions: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 + non_redistributable: true tables: dynabench: From 2509197f5695b5c674940cff3bdf28f894c9048a Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 10 Apr 2024 21:05:48 +0200 Subject: [PATCH 09/61] =?UTF-8?q?=F0=9F=90=9D=20Update=20pandas=20to=202.2?= =?UTF-8?q?=20(#2468)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🐝 Update pandas to 2.2 --- etl/config.py | 7 +- .../2023-06-08/yougov_end_of_humanity.py | 1 + .../2023-06-14/ai_national_strategy.py | 4 +- .../2023-06-26/ai_wrp_2021.py | 4 +- .../data/garden/democracy/2024-03-07/bmr.py | 2 +- .../education/2023-07-17/education_lee_lee.py | 18 +- .../garden/education/2023-07-17/shared.py | 8 +- .../2024-02-26/gdp_and_co2_decoupling.py | 1 + .../latest/xm_karlinsky_kobak.py | 4 +- .../data/garden/faostat/2022-05-17/shared.py | 4 +- .../data/garden/faostat/2023-02-22/shared.py | 4 +- .../data/garden/faostat/2023-06-12/shared.py | 4 +- .../faostat/2024-03-14/faostat_metadata.py | 4 +- .../data/garden/faostat/2024-03-14/shared.py | 11 +- .../2023-09-18/colonial_dates_dataset.py | 3 + .../garden/homicide/2023-01-03/who_mort_db.py | 4 +- .../2023-06-14/prevalence_dalys_world.py | 2 +- .../plastic_waste/2023-09-26/geyer_2017.py | 5 +- .../data/garden/tourism/2023-05-05/unwto.py | 6 +- .../2023-11-27/unhlm_commitments.py | 2 +- .../urban_agglomerations_definition_count.py | 4 +- .../2024-01-09/nuclear_weapons_inventories.py | 2 +- .../status_of_world_nuclear_forces.py | 3 +- .../war/2024-01-25/nuclear_weapons_tests.py | 2 +- .../data/garden/wb/2021-07-01/wb_income.ipynb | 205 ++++++++++-- .../who/2023-04-03/flu_vaccine_policy.py | 8 +- etl/steps/data/garden/who/2024-01-03/gho.py | 7 +- .../garden/wvs/2023-06-25/longitudinal_wvs.py | 3 + .../surface_temperature_anomalies.py | 1 - .../2023-12-20/surface_temperature_monthly.py | 1 - .../2023-06-07/monmouth_poll.py | 2 +- .../2023-06-08/yougov_end_of_humanity.py | 6 +- .../2023-06-08/yougov_jobs.py | 2 +- .../climate/2024-01-28/global_sea_level.py | 2 +- .../health/2023-05-04/global_wellbeing.py | 8 +- .../oecd/2023-05-18/co2_air_transport.py | 2 +- .../data/meadow/tourism/2023-05-05/unwto.py | 5 +- .../tourism/2023-05-09/unwto_environment.py | 1 + lib/catalog/owid/catalog/tables.py | 119 ++++--- lib/catalog/poetry.lock | 117 ++++--- lib/catalog/pyproject.toml | 2 +- lib/catalog/tests/test_tables.py | 14 + lib/datautils/owid/datautils/dataframes.py | 4 +- lib/datautils/poetry.lock | 236 ++++++++------ lib/datautils/pyproject.toml | 4 +- lib/datautils/tests/test_dataframes.py | 12 +- lib/repack/poetry.lock | 153 +++++++-- lib/repack/pyproject.toml | 4 +- lib/walden/poetry.lock | 48 +-- poetry.lock | 307 ++++++++++-------- pyproject.toml | 3 +- 51 files changed, 906 insertions(+), 479 deletions(-) diff --git a/etl/config.py b/etl/config.py index 5005ea5cef7..d73719b985c 100644 --- a/etl/config.py +++ b/etl/config.py @@ -12,6 +12,7 @@ from os import environ as 
env import bugsnag +import pandas as pd from dotenv import load_dotenv from etl.paths import BASE_DIR @@ -30,6 +31,10 @@ def load_env(): load_env() + + +pd.set_option("future.no_silent_downcasting", True) + # When DEBUG is on # - run steps in the same process (speeding up ETL) DEBUG = env.get("DEBUG") in ("True", "true", "1") @@ -131,7 +136,7 @@ def variable_metadata_url(variable_id): MAX_VIRTUAL_MEMORY_LINUX = 32 * 2**30 # 32 GB # increment this to force a full rebuild of all datasets -ETL_EPOCH = 4 +ETL_EPOCH = 5 # any garden or grapher dataset after this date will have strict mode enabled STRICT_AFTER = "2023-06-25" diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py b/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py index ac765801239..8e4756f09ec 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py @@ -61,6 +61,7 @@ def run(dest_dir: str) -> None: # Transform the 'melted_df_all_age_groups' dataframe into a pivot table with 'options' as index and # each unique value in 'melted_columns' as a column. Store the pivot table in 'pivot_df_all_age_groups'. + melted_df_all_age_groups = melted_df_all_age_groups.astype({"melted_columns": "category"}) pivot_df_all_age_groups = melted_df_all_age_groups.pivot_table( index=["options"], columns="melted_columns", values="value" ) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index 9fa62b488c5..ad708f5e3cb 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -58,10 +58,10 @@ def run(dest_dir: str) -> None: # Check if any year for the current country is not NaN if not group["released_national_strategy_on_ai"].isna().all(): # Forward fill NaN values after "Released" - group["released_national_strategy_on_ai"].fillna(method="ffill", inplace=True) + group["released_national_strategy_on_ai"] = group["released_national_strategy_on_ai"].fillna(method="ffill") # Fill remaining NaN values with "Not Released" - group["released_national_strategy_on_ai"].fillna("Not released", inplace=True) + group["released_national_strategy_on_ai"] = group["released_national_strategy_on_ai"].fillna("Not released") df_merged.loc[group.index] = group df_merged.drop("released", axis=1, inplace=True) tb = Table(df_merged, short_name=paths.short_name, underscore=True) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py index bfe7cd4e1c7..68e158aa09f 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py @@ -154,7 +154,7 @@ def calculate_percentage(df, column, valid_responses_dict, column_to_split_by): df_filtered = df[[column_to_split_by, "year", column]][valid_responses].reset_index(drop=True) # Group by country and year - grouped = df_filtered.groupby([column_to_split_by, "year"]) + grouped = df_filtered.groupby([column_to_split_by, "year"], observed=True) # Count valid responses counts = grouped[column].value_counts().reset_index(name="count") @@ -343,7 +343,7 @@ def pivot_by_category(df, question): # 
Iterate over each pivot column for pivot_col in cols_pivot: # Pivot the dataframe for the current pivot column - pivoted_df = pd.pivot_table(df, values=question, index=["country", "year"], columns=pivot_col) + pivoted_df = pd.pivot_table(df, values=question, index=["country", "year"], columns=pivot_col, observed=True) # Append the pivot table to the list pivot_tables.append(pivoted_df) diff --git a/etl/steps/data/garden/democracy/2024-03-07/bmr.py b/etl/steps/data/garden/democracy/2024-03-07/bmr.py index 578cd9c96f8..74aa650c352 100644 --- a/etl/steps/data/garden/democracy/2024-03-07/bmr.py +++ b/etl/steps/data/garden/democracy/2024-03-07/bmr.py @@ -274,7 +274,7 @@ def add_imputes(tb: Table) -> Table: tb = concat(tb_imputed + [tb], ignore_index=True) # Set to False by default (for non-imputed countries) - tb["regime_imputed"] = tb["regime_imputed"].fillna(False) + tb["regime_imputed"] = tb["regime_imputed"].fillna(False).astype(bool) # Re-order columns cols = [ diff --git a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py index 01dd3c7918a..4c73aaa3796 100644 --- a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py +++ b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py @@ -74,13 +74,17 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) # Replace age group values with descriptive labels - tb["age_group"] = tb["age_group"].replace( - { - "15.0-64.0": "Youth and Adults (15-64 years)", - "15.0-24.0": "Youth (15-24 years)", - "25.0-64.0": "Adults (25-64 years)", - "not specified": "Age not specified", - } + tb["age_group"] = ( + tb["age_group"] + .astype(str) + .replace( + { + "15.0-64.0": "Youth and Adults (15-64 years)", + "15.0-24.0": "Youth (15-24 years)", + "25.0-64.0": "Adults (25-64 years)", + "not specified": "Age not specified", + } + ) ) # Prepare enrollment and attainment data diff --git a/etl/steps/data/garden/education/2023-07-17/shared.py b/etl/steps/data/garden/education/2023-07-17/shared.py index 8db6ff57962..ce0998d945d 100644 --- a/etl/steps/data/garden/education/2023-07-17/shared.py +++ b/etl/steps/data/garden/education/2023-07-17/shared.py @@ -135,7 +135,11 @@ def add_region_aggregates_education( def weighted_mean(x, w): values = np.ma.masked_invalid(x.astype("float64")) weights = np.ma.masked_invalid(w.astype("float64")) - return np.ma.average(values, weights=weights) + out = np.ma.average(values, weights=weights) + if np.ma.is_masked(out): + return np.nan + else: + return out # Create a closure to define variable_agg with specific weights def make_weighted_mean(weights): @@ -149,7 +153,7 @@ def variable_agg(x): else: variable_agg = aggregations[variable] - aggs[variable] = variable_agg + aggs[variable] = variable_agg # type: ignore df_region = groupby_agg( df=df_countries, diff --git a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py index 98fd7cb5b14..9f9142fdd66 100644 --- a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py +++ b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py @@ -9,6 +9,7 @@ TODO: Include link to the updated static chart once it is created. 
""" + from structlog import get_logger from etl.helpers import PathFinder, create_dataset diff --git a/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py b/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py index 380500d9c2f..4a940cfef89 100644 --- a/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py +++ b/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py @@ -262,7 +262,9 @@ def add_uk_by_age(df: pd.DataFrame): time_units = df_uk["time_unit"].unique() assert len(time_units) == 1, "There are multiple time units for UK Nations" # Estimate metrics - df_uk = df_uk.groupby(["year", "time", "age"], as_index=False).sum(min_count=3) + df_uk = ( + df_uk.drop(columns=["entity", "time_unit"]).groupby(["year", "time", "age"], as_index=False).sum(min_count=3) + ) # Reassign entity name and time unit df_uk["entity"] = "United Kingdom" df_uk["time_unit"] = time_units[0] diff --git a/etl/steps/data/garden/faostat/2022-05-17/shared.py b/etl/steps/data/garden/faostat/2022-05-17/shared.py index 2422e17b7c4..d7fb893e2cc 100644 --- a/etl/steps/data/garden/faostat/2022-05-17/shared.py +++ b/etl/steps/data/garden/faostat/2022-05-17/shared.py @@ -1366,7 +1366,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1417,7 +1417,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2023-02-22/shared.py b/etl/steps/data/garden/faostat/2023-02-22/shared.py index 6f18800d737..120f7f476df 100644 --- a/etl/steps/data/garden/faostat/2023-02-22/shared.py +++ b/etl/steps/data/garden/faostat/2023-02-22/shared.py @@ -1304,7 +1304,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1355,7 +1355,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. 
element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2023-06-12/shared.py b/etl/steps/data/garden/faostat/2023-06-12/shared.py index 1953069445b..9c6774e9f77 100644 --- a/etl/steps/data/garden/faostat/2023-06-12/shared.py +++ b/etl/steps/data/garden/faostat/2023-06-12/shared.py @@ -1314,7 +1314,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1365,7 +1365,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 17514fea34c..03c0c45e48b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -498,7 +498,9 @@ def create_elements_table_for_domain(table: Table, metadata: Dataset, dataset_sh .sort_values(["fao_unit_short_name"]) .reset_index(drop=True) ) - elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna(elements_from_data["fao_unit_short_name"]) + elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna( + elements_from_data["fao_unit_short_name"].astype(object) + ) # Sanity checks: diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index ec239660fea..9377889c115 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -974,8 +974,11 @@ def remove_overlapping_data_between_historical_regions_and_successors( columns ].drop_duplicates() # Find unique years where the above combinations of item-element-years of region and successors overlap. 
- overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) - overlapping_years = overlapping_years[overlapping_years.duplicated()] + if historical_region_years.empty and historical_successors_years.empty: + overlapping_years = pd.DataFrame() + else: + overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) + overlapping_years = overlapping_years[overlapping_years.duplicated()] if not overlapping_years.empty: log.warning( f"Removing rows where historical region {historical_region} overlaps with its successors " @@ -1298,7 +1301,7 @@ def convert_variables_given_per_capita_to_total_value(tb: Table, elements_metada # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: tb = tb.copy() @@ -1349,7 +1352,7 @@ def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=tb_with_pc_variables.shape) diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py index f2b706763db..632347f3ea9 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py @@ -182,6 +182,9 @@ def regional_aggregations(tb: Table, tb_pop: Table) -> Table: # Define non-colonies identifiers for `colonizer` non_colonies = ["zz. Colonizer", "zzz. Not colonized", "zzzz. 
No longer colonized"] + # Backwards compatibility + tb_regions["colonizer"] = tb_regions["colonizer"].astype(object).fillna(np.nan) + # Define colony_number, which is 1 if countries are not in non_colonies and colony_pop, which is the product of colony and population tb_regions["colony_number"] = tb_regions["colonizer"].apply(lambda x: 0 if x in non_colonies else 1) tb_regions["colony_pop"] = tb_regions["population"] * tb_regions["colony_number"] diff --git a/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py b/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py index 5131c53c201..0e5ed5f2577 100644 --- a/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py +++ b/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py @@ -48,7 +48,7 @@ def run(dest_dir: str) -> None: ds_meadow = Dataset(DATA_DIR / "meadow/homicide/2023-01-03/who_mort_db") tb_meadow = ds_meadow["who_mort_db"] - df = pd.DataFrame(tb_meadow) + df = pd.DataFrame(tb_meadow).astype({"number_of_deaths": float}) log.info("who_mort_db.exclude_countries") df = exclude_countries(df) @@ -92,7 +92,7 @@ def run(dest_dir: str) -> None: def clean_up_dimensions(df: pd.DataFrame) -> pd.DataFrame: sex_dict = {"All": "Both Sexes", "Male": "Males", "Female": "Females", "Unknown": "Unknown sex"} age_dict = {"Age_all": "All ages", "Age_unknown": "Unknown age"} - df = df.replace({"sex": sex_dict, "age_group_code": age_dict}) + df = df.astype({"sex": str, "age_group_code": str}).replace({"sex": sex_dict, "age_group_code": age_dict}) return df diff --git a/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py b/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py index 38473bb5aba..5c3ec2b8b5d 100644 --- a/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py +++ b/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py @@ -102,7 +102,7 @@ def make_table_prevalence(ds: Dataset) -> Table: "share_eating_disorders": "Eating disorders", "share_schizophrenia_disorders": "Schizophrenia", } - tb = tb.rename(columns=column_rename)[set(column_rename.values()) | {"year"}] + tb = tb.rename(columns=column_rename)[list(set(column_rename.values()) | {"year"})] # Unpivot tb = tb.melt(id_vars=["year"], var_name="cause", value_name="share_rate") diff --git a/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py b/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py index 6ef195fa15b..edd8db37ee3 100644 --- a/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py +++ b/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" import owid.catalog.processing as pr +import pandas as pd from etl.helpers import PathFinder, create_dataset @@ -27,8 +28,8 @@ def run(dest_dir: str) -> None: for year in range(2016, 2019): # 2019 is the stop value and is not included last_value = tb.loc[tb.index[-1], "plastic_production"] # Getting the last value in the 'Value' column new_value = last_value * (1 + growth_rate) # Calculating the value for the new year - new_row = {"country": "World", "year": year, "plastic_production": new_value} # Creating a new row - tb = tb.append(new_row, ignore_index=True) # Adding the new row to the DataFrame + new_row = pd.Series({"country": "World", "year": year, "plastic_production": new_value}) # Creating a new row + tb.loc[len(tb)] = new_row tb["plastic_production"] = tb["plastic_production"] * 1e6 # Convert to millions # Add data from OECD for 2019 diff --git 
a/etl/steps/data/garden/tourism/2023-05-05/unwto.py b/etl/steps/data/garden/tourism/2023-05-05/unwto.py index 54c1a2b5dbf..36208493c09 100644 --- a/etl/steps/data/garden/tourism/2023-05-05/unwto.py +++ b/etl/steps/data/garden/tourism/2023-05-05/unwto.py @@ -63,14 +63,14 @@ def run(dest_dir: str) -> None: merged_df_drop_ = merged_df.loc[~merged_df.country.isin(["Saba", "Sint Eustatius", "Bonaire"])] # Concatenate 'merged_df_drop_' and 'sum_bon_sint_saba' into a single DataFrame 'merged_df_concat'. # The rows of 'sum_bon_sint_saba' will be appended to 'merged_df_drop_'. - merged_df_concat = merged_df_drop_.append(sum_bon_sint_saba, ignore_index=True) + merged_df_concat = pd.concat([merged_df_drop_, sum_bon_sint_saba], ignore_index=True) # Set index, check that it's unique and reset index - assert not merged_df_concat[["country", "year"]].duplicated().any(), "Index is not well constructed" + assert not merged_df_concat[["country", "year"]].duplicated().any(), "Index is not well constructed" # type: ignore # Aggregate data by region (decided not to do for now) # Africa, Oceania, and income level categories - # regions_ = ["North America", + ## regions_ = ["North America", # "South America", # "Europe", # "Africa", diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py index 9a2ea51f26d..ae7a4784f82 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py @@ -76,7 +76,7 @@ def add_meaning_to_codes(tb: Table) -> Table: "min_tra_collab", ] - tb[cols_0_1_3] = tb[cols_0_1_3].astype("category").replace({0: "No", 1: "Yes", 3: "Don't know"}) + tb[cols_0_1_3] = tb[cols_0_1_3].astype(object).replace({0: "No", 1: "Yes", 3: "Don't know"}).astype("category") tb[cols_other] = ( tb[cols_other] .astype("object") diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py index 104a3d08220..66c12efbfd0 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py @@ -35,7 +35,9 @@ def run(dest_dir: str) -> None: # Add a 'year' column filled with 2018 df_counts["year"] = 2018 - df_counts["countries"] = df_counts["countries"].apply(lambda x: f"{x:,} inhabitants" if isinstance(x, int) else x) + df_counts["countries"] = ( + df_counts["countries"].astype(object).apply(lambda x: f"{x:,} inhabitants" if isinstance(x, int) else x) + ) # Replace '' values in the 'countries' column with 'No minimum population threshold' df_counts["countries"] = df_counts["countries"].astype(str).replace("", "No minimum population threshold") diff --git a/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py b/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py index 7f7c870a0aa..c5c662fd39c 100644 --- a/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py +++ b/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py @@ -29,7 +29,7 @@ def run(dest_dir: str) -> None: tb = tb.rename(columns=COLUMNS, errors="raise") # Looking at the original dashboards, it seems that missing values are shown as zeros. - tb = tb.fillna(0) + tb["number_of_warheads"] = tb["number_of_warheads"].fillna(0) # Harmonize country names. 
tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py b/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py index 671d20ecf9d..a5a5bcd91d0 100644 --- a/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py +++ b/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py @@ -38,7 +38,8 @@ def run(dest_dir: str) -> None: # Looking at the original dashboard, it seems that missing values are shown as zeros. # https://public.tableau.com/app/profile/kate.kohn/viz/EstimatedGlobalNuclearWarheadInventories2021/Dashboard1 - tb = tb.fillna(0) + cols = [c for c in tb.columns if c not in ["country", "year"]] + tb[cols] = tb[cols].fillna(0) # Harmonize country names. tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py b/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py index 8941f0c9885..554eafe4d68 100644 --- a/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py +++ b/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py @@ -35,7 +35,7 @@ def run(dest_dir: str) -> None: # Process data. # # By looking at the original table, it seems clear that empty cells mean zero. - tb = tb.fillna(0) + tb = tb.astype(object).fillna(0) # Temporarily convert all columns to string (to avoid issues with categorical variables). tb = tb.astype(str) diff --git a/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb b/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb index 2c7a7bc580d..69a65f20899 100644 --- a/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb +++ b/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb @@ -86,10 +86,20 @@ "id": "e001fe46", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/plain": [
-       "PosixPath('/Users/mojmir/projects/etl/data/meadow/wb/2021-07-01/wb_income')"
+       "\u001b[1;35mPosixPath\u001b[0m\u001b[1m(\u001b[0m\u001b[32m'/Users/mojmir/projects/etl2/data/meadow/wb/2021-07-01/wb_income'\u001b[0m\u001b[1m)\u001b[0m"
       ]
      },
      "execution_count": 4,
@@ -103,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 5,
    "id": "134ea32a-77b4-4e4c-af5c-400f6edd5866",
    "metadata": {},
    "outputs": [],
@@ -114,17 +124,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 6,
    "id": "24c738cd",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "text/plain": [
-       "['wb_income_group']"
+       "\u001b[1m[\u001b[0m\u001b[32m'wb_income_group'\u001b[0m\u001b[1m]\u001b[0m"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -135,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 7,
    "id": "5553eb58-fd10-4a93-9356-859121b7bed0",
    "metadata": {
     "tags": []
@@ -148,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 8,
    "id": "e9a67fe4-ca1e-4e73-b667-6cef8cc573b2",
    "metadata": {},
    "outputs": [
@@ -162,7 +182,20 @@
     {
      "data": {
       "text/html": [
-       "
\n", + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "             

wb_income_group

\n", + "

table

\n", + "
\n", "