From 52af71b2f6c52accc6728326ae693281be3013c0 Mon Sep 17 00:00:00 2001 From: owidbot Date: Thu, 11 Apr 2024 15:57:31 +0000 Subject: [PATCH 01/40] :robot: Metadata update by Admin --- etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml diff --git a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml new file mode 100644 index 00000000000..282a3c521ac --- /dev/null +++ b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml @@ -0,0 +1,6 @@ +tables: + prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct: + variables: + ? |- + prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct__sex_both_sexes__age_group_18plus__years + : title: Share of adults who are overweight (age-standardized) From 9b367c7b4d7e843827e07573649a625dba102dfc Mon Sep 17 00:00:00 2001 From: owidbot Date: Thu, 11 Apr 2024 16:01:24 +0000 Subject: [PATCH 02/40] fasttrack: fasttrack/latest/gpei.csv --- dag/fasttrack.yml | 2 ++ .../grapher/fasttrack/latest/gpei.meta.yml | 33 +++++++++++++++++++ .../data/grapher/fasttrack/latest/gpei.py | 22 +++++++++++++ snapshots/fasttrack/latest/gpei.csv.dvc | 24 ++++++++++++++ 4 files changed, 81 insertions(+) create mode 100644 etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml create mode 100644 etl/steps/data/grapher/fasttrack/latest/gpei.py create mode 100644 snapshots/fasttrack/latest/gpei.csv.dvc diff --git a/dag/fasttrack.yml b/dag/fasttrack.yml index 7b0596c8741..9ad5e3fc844 100644 --- a/dag/fasttrack.yml +++ b/dag/fasttrack.yml @@ -154,3 +154,5 @@ steps: - snapshot://fasttrack/latest/usa_weather_climate_noaa.csv data://grapher/fasttrack/latest/global_precipitation_anomaly_noaa: - snapshot://fasttrack/latest/global_precipitation_anomaly_noaa.csv + data://grapher/fasttrack/latest/gpei: + - snapshot://fasttrack/latest/gpei.csv diff --git a/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml b/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml new file mode 100644 index 00000000000..4c1574c2d36 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml @@ -0,0 +1,33 @@ +dataset: + title: Global Polio Eradication Initiative - cVDPV + description: '' + licenses: + - name: CC BY-NC-SA 3.0 IGO + url: https://polioeradication.org/terms-of-use/ +tables: + gpei: + variables: + cvdpv1: + title: circulating_vaccine_derived_polio_virus_1 + unit: cases + display: + numDecimalPlaces: 0 + description: Cases of circulating vaccine derived polio virus, type 1. + cvdpv2: + title: circulating_vaccine_derived_polio_virus_2 + unit: cases + display: + numDecimalPlaces: 0 + description: Cases of circulating vaccine derived polio virus, type 2. + cvdpv3: + title: circulating_vaccine_derived_polio_virus_3 + unit: cases + display: + numDecimalPlaces: 0 + description: Cases of circulating vaccine derived polio virus, type 3. + total_cvdpv: + title: circulating_vaccine_derived_polio_virus_total + unit: cases + display: + numDecimalPlaces: 0 + description: Cases of circulating vaccine derived polio virus, all types. 
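A side note on the "? |-" construct used in the gho.meta.override.yml diffs above and below: it is YAML's explicit-key form with a block-scalar key, which is what a YAML emitter typically falls back to when a mapping key (here an auto-generated indicator short name) is too long to serialize as a plain key. A minimal sketch of how such a file parses, with made-up table and indicator names:

import yaml

OVERRIDE = """
tables:
  some_table:
    variables:
      ? |-
        a_very_long_auto_generated_indicator_short_name__with_many_dimension_suffixes
      : title: Human-readable title
"""

parsed = yaml.safe_load(OVERRIDE)
assert "a_very_long_auto_generated_indicator_short_name__with_many_dimension_suffixes" in (
    parsed["tables"]["some_table"]["variables"]
)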
diff --git a/etl/steps/data/grapher/fasttrack/latest/gpei.py b/etl/steps/data/grapher/fasttrack/latest/gpei.py new file mode 100644 index 00000000000..0b9f8ef1fbf --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/gpei.py @@ -0,0 +1,22 @@ +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/latest/gpei.csv") + + # load data + tb = snap.read_csv() + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() diff --git a/snapshots/fasttrack/latest/gpei.csv.dvc b/snapshots/fasttrack/latest/gpei.csv.dvc new file mode 100644 index 00000000000..15c64b743c4 --- /dev/null +++ b/snapshots/fasttrack/latest/gpei.csv.dvc @@ -0,0 +1,24 @@ +meta: + origin: + producer: Global Polio Eradication Initiative + title: Circulating Vaccine Derived Polio Virus + attribution: Global Polio Eradication Initiative + attribution_short: GPEI + version_producer: Google Sheet + url_main: |- + https://polioeradication.org/wp-content/uploads/2024/04/weekly-polio-analyses-cVDPV-20240402.pdf; https://polioeradication.org/wp-content/uploads/2022/04/weekly-polio-analyses-cVDPV-20220405.pdf + url_download: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vRxoU5EL03HvNZKmbDLCNDR8ZhOqd3C6guk9cVIX8uuUnXtMj2Do6mKUo4xhPO6q8KMw2At5ts05T4R/pub?output=csv + date_accessed: '2024-04-11' + license: + name: https://polioeradication.org/terms-of-use/ + url: CC BY-NC-SA 3.0 IGO + name: Global Polio Eradication Initiative - cVDPV + description: '' + license: + name: CC BY-NC-SA 3.0 IGO + url: https://polioeradication.org/terms-of-use/ +outs: + - md5: 4631f6835f642e97800fce7c2250fdd0 + size: 2840 + path: gpei.csv From 6b87ebf02ecfe4f27ec731fdc3bce995e7971fc1 Mon Sep 17 00:00:00 2001 From: Marigold Date: Thu, 11 Apr 2024 22:46:59 +0200 Subject: [PATCH 03/40] :bug: hotfix metadata --- etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml index 282a3c521ac..d73471859f7 100644 --- a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml +++ b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml @@ -2,5 +2,5 @@ tables: prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct: variables: ? 
|- - prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct__sex_both_sexes__age_group_18plus__years + prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct : title: Share of adults who are overweight (age-standardized) From 7bb3c2b862bcbe4b991da86cee5dd01598e791bf Mon Sep 17 00:00:00 2001 From: owidbot Date: Fri, 12 Apr 2024 04:03:42 +0000 Subject: [PATCH 04/40] :robot: automatic wildfires update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 4 ++-- snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 73b92d25225..80dfa615e25 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,8 +9,8 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-04-11 - date_published: 2024-04-11 + date_accessed: 2024-04-12 + date_published: 2024-04-12 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index b36f25da684..56571ce4d93 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-11 + date_accessed: 2024-04-12 publication_date: 2024-03-18 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 9b63532ccee..0e5f2f3fdfd 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-11 + date_accessed: 2024-04-12 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 581bd5f1206..8e76a8ebda8 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. 
url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-11 + date_accessed: 2024-04-12 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index d52caeb9c77..fc1f5119059 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-11 + date_accessed: 2024-04-12 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From cb4aff2db5106201680ea9a4940fa2fe5bc42e6d Mon Sep 17 00:00:00 2001 From: owidbot Date: Fri, 12 Apr 2024 04:05:37 +0000 Subject: [PATCH 05/40] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index a1752d6a8ea..9eae4761e07 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: aa9466d41960d48a594b05b7153b50f0 - size: 150634788 + - md5: 3cc56ba930e8b8c6c383b9f227d04b66 + size: 150642152 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 10d86c6a41c..c2fd0d30b67 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. 
wdir: ../../../data/snapshots/who/latest outs: - - md5: 2f87106c852ce3bd068bf8f8e6771d7e - size: 25764329 + - md5: 84662aad1845ce4641ffdf4d664af262 + size: 25764774 path: flunet.csv From b1220e611ecdc28742e0b4077054847f7c023ac5 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Fri, 12 Apr 2024 10:04:22 +0200 Subject: [PATCH 06/40] :sparkles: improve datadiff posted by owidbot (#2507) * :sparkles: improve datadiff posted by owidbot * add include * prune equal datasets and add info about staging servers * Fix regex pattern in etldiff.py and handle exceptions in datadiff.py --- apps/owidbot/etldiff.py | 41 ++++++++++++++++++++++++++++++++++++++--- etl/datadiff.py | 2 +- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py index e72652b5fce..261feb56da0 100644 --- a/apps/owidbot/etldiff.py +++ b/apps/owidbot/etldiff.py @@ -1,4 +1,5 @@ import datetime as dt +import re import subprocess from typing import Tuple @@ -9,6 +10,7 @@ from rich.ansi import AnsiDecoder from rich_click.rich_command import RichCommand +from apps.staging_sync.cli import _normalise_branch from etl import config from etl.paths import BASE_DIR @@ -23,6 +25,12 @@ "--branch", type=str, ) +@click.option( + "--include", + type=str, + default="garden", + help="Include datasets matching this regex.", +) @click.option( "--dry-run/--no-dry-run", default=False, @@ -31,6 +39,7 @@ ) def cli( branch: str, + include: str, dry_run: bool, ) -> None: """Post result of `etl diff` to Github PR. @@ -41,12 +50,24 @@ def cli( $ python apps/owidbot/etldiff.py --branch my-branch ``` """ - lines = call_etl_diff() + lines = call_etl_diff(include) diff, result = format_etl_diff(lines) + nbranch = _normalise_branch(branch) if branch else "dry-run" + body = f"""
+Staging server: + +- **Admin**: http://staging-site-{nbranch}/admin/login +- **Site**: http://staging-site-{nbranch}/ +- **Login**: `ssh owid@staging-site-{nbranch}` +- **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch} +
+ +
+ etl diff: {result} ```diff @@ -117,10 +138,24 @@ def format_etl_diff(lines: list[str]) -> Tuple[str, str]: new_lines.append(line) diff = "\n".join(new_lines) + + # Some datasets might have different checksum, but be the same (this is caused by checksum_input and checksum_output + # problem). Hotfix this by removing matching datasets from the output. + # Example: + # = Dataset meadow/agriculture/2024-03-26/attainable_yields + # = Table attainable_yields + # = Dataset garden/agriculture/2024-03-26/attainable_yields + # = Table attainable_yields + # ~ Column A + # = Dataset grapher/agriculture/2024-03-26/attainable_yields + # = Table attainable_yields + pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)" + diff = re.sub(pattern, "", diff) + return diff, result -def call_etl_diff() -> list[str]: +def call_etl_diff(include: str) -> list[str]: cmd = [ "poetry", "run", @@ -129,7 +164,7 @@ def call_etl_diff() -> list[str]: "REMOTE", "data/", "--include", - "garden", + include, "--exclude", EXCLUDE_DATASETS, "--verbose", diff --git a/etl/datadiff.py b/etl/datadiff.py index b7eaa5e97f9..9820496d40f 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -451,7 +451,7 @@ def _append_and_print(x): continue except Exception as e: # soft fail and continue with another dataset - log.error(e, exc_info=True) + log.error("\n".join(traceback.format_exception(type(e), e, e.__traceback__))) any_error = True continue From ac3fa818832c2e3e1a0be922e6963e58ae1fedce Mon Sep 17 00:00:00 2001 From: owidbot Date: Fri, 12 Apr 2024 08:15:02 +0000 Subject: [PATCH 07/40] fasttrack: fasttrack/latest/gpei.csv --- etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml | 8 ++++---- snapshots/fasttrack/latest/gpei.csv.dvc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml b/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml index 4c1574c2d36..8a8258e235b 100644 --- a/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml +++ b/etl/steps/data/grapher/fasttrack/latest/gpei.meta.yml @@ -8,25 +8,25 @@ tables: gpei: variables: cvdpv1: - title: circulating_vaccine_derived_polio_virus_1 + title: cVDPV1 unit: cases display: numDecimalPlaces: 0 description: Cases of circulating vaccine derived polio virus, type 1. cvdpv2: - title: circulating_vaccine_derived_polio_virus_2 + title: cVDPV2 unit: cases display: numDecimalPlaces: 0 description: Cases of circulating vaccine derived polio virus, type 2. cvdpv3: - title: circulating_vaccine_derived_polio_virus_3 + title: CVDPV3 unit: cases display: numDecimalPlaces: 0 description: Cases of circulating vaccine derived polio virus, type 3. 
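To make the pruning added to format_etl_diff() in the owidbot patch above concrete, here is a small, self-contained illustration; the regex is copied from the patch, while the dataset and column names are invented:

import re

# Dataset blocks in which every line is an "=" (unchanged) line are dropped,
# so only datasets that contain an actual change remain in the output.
pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)"

sample = (
    "= Dataset meadow/example/2024-01-01/demo\n"
    "  = Table demo\n"
    "= Dataset garden/example/2024-01-01/demo\n"
    "  = Table demo\n"
    "    ~ Column a\n"
    "\n"
)

pruned = re.sub(pattern, "", sample)
# Only the garden dataset survives, because it contains a modified ("~") column.
assert "meadow/example" not in pruned and "garden/example" in pruned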
total_cvdpv: - title: circulating_vaccine_derived_polio_virus_total + title: cVDPV total unit: cases display: numDecimalPlaces: 0 diff --git a/snapshots/fasttrack/latest/gpei.csv.dvc b/snapshots/fasttrack/latest/gpei.csv.dvc index 15c64b743c4..c042660f475 100644 --- a/snapshots/fasttrack/latest/gpei.csv.dvc +++ b/snapshots/fasttrack/latest/gpei.csv.dvc @@ -9,7 +9,7 @@ meta: https://polioeradication.org/wp-content/uploads/2024/04/weekly-polio-analyses-cVDPV-20240402.pdf; https://polioeradication.org/wp-content/uploads/2022/04/weekly-polio-analyses-cVDPV-20220405.pdf url_download: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vRxoU5EL03HvNZKmbDLCNDR8ZhOqd3C6guk9cVIX8uuUnXtMj2Do6mKUo4xhPO6q8KMw2At5ts05T4R/pub?output=csv - date_accessed: '2024-04-11' + date_accessed: '2024-04-12' license: name: https://polioeradication.org/terms-of-use/ url: CC BY-NC-SA 3.0 IGO From 3087f5709b1348febbb6b205749f936956baf55a Mon Sep 17 00:00:00 2001 From: Marigold Date: Fri, 12 Apr 2024 11:17:38 +0200 Subject: [PATCH 08/40] :bug: handle indicators with dimensions correctly in ETL API --- api/v1/__init__.py | 6 +++++- etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml | 5 ++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/api/v1/__init__.py b/api/v1/__init__.py index a4443197f3b..a9877e90347 100644 --- a/api/v1/__init__.py +++ b/api/v1/__init__.py @@ -179,8 +179,12 @@ def _indicator_metadata_dict(indicator: Indicator, db_indicator: gm.Variable) -> indicator_update_dict = indicator.to_meta_dict() update_period_days = indicator_update_dict.pop("update_period_days", None) + # if indicator has dimensions, use its original name + original_short_name = (db_indicator.dimensions or {}).get("originalShortName") + short_name = original_short_name or db_indicator.shortName + # create dictionary for metadata - meta_dict = {"tables": {db_indicator.table_name: {"variables": {db_indicator.shortName: indicator_update_dict}}}} + meta_dict = {"tables": {db_indicator.table_name: {"variables": {short_name: indicator_update_dict}}}} if update_period_days: meta_dict["dataset"] = {"update_period_days": update_period_days} diff --git a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml index d73471859f7..8e93bdb0080 100644 --- a/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml +++ b/etl/steps/data/grapher/who/2024-01-03/gho.meta.override.yml @@ -1,6 +1,5 @@ tables: prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct: variables: - ? 
|- - prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct - : title: Share of adults who are overweight (age-standardized) + prevalence_of_overweight_among_adults__bmi__gt__25__age_standardized_estimate__pct: + title: Share of adults who are overweight (age-standardized) From 5d8bd5de9ebc71306ccea4280d8cd1db70d221a8 Mon Sep 17 00:00:00 2001 From: owidbot Date: Fri, 12 Apr 2024 09:59:19 +0000 Subject: [PATCH 09/40] fasttrack: fasttrack/latest/gpei.csv --- snapshots/fasttrack/latest/gpei.csv.dvc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/fasttrack/latest/gpei.csv.dvc b/snapshots/fasttrack/latest/gpei.csv.dvc index c042660f475..f64724cb0f7 100644 --- a/snapshots/fasttrack/latest/gpei.csv.dvc +++ b/snapshots/fasttrack/latest/gpei.csv.dvc @@ -19,6 +19,6 @@ meta: name: CC BY-NC-SA 3.0 IGO url: https://polioeradication.org/terms-of-use/ outs: - - md5: 4631f6835f642e97800fce7c2250fdd0 - size: 2840 + - md5: df37f065fdc2bc947d26ec1926f898d3 + size: 2966 path: gpei.csv From 5f899226bc07f7691fe0e426cb86e0fad109b1dd Mon Sep 17 00:00:00 2001 From: owidbot Date: Sat, 13 Apr 2024 04:03:43 +0000 Subject: [PATCH 10/40] :robot: automatic wildfires update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 4 ++-- snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 80dfa615e25..6eab1c6ccc6 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,8 +9,8 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-04-12 - date_published: 2024-04-12 + date_accessed: 2024-04-13 + date_published: 2024-04-13 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index 56571ce4d93..fa8aaf52d13 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-12 + date_accessed: 2024-04-13 publication_date: 2024-03-18 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 0e5f2f3fdfd..36018b29cb8 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. 
url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-12 + date_accessed: 2024-04-13 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 8e76a8ebda8..0a792ecb64c 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-12 + date_accessed: 2024-04-13 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index fc1f5119059..e79268b4901 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-12 + date_accessed: 2024-04-13 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From b62d8e3ce2bae0352a4b67e382aeadb49c00d949 Mon Sep 17 00:00:00 2001 From: owidbot Date: Sat, 13 Apr 2024 04:04:57 +0000 Subject: [PATCH 11/40] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 9eae4761e07..7b19b1ff4f0 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 3cc56ba930e8b8c6c383b9f227d04b66 - size: 150642152 + - md5: afdb4e941ee982a63ac479aad6869d2f + size: 150755257 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index c2fd0d30b67..df5035ce3bf 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. 
wdir: ../../../data/snapshots/who/latest outs: - - md5: 84662aad1845ce4641ffdf4d664af262 - size: 25764774 + - md5: 5f7fb645f2cb0cf75314ab1f1473a106 + size: 25784618 path: flunet.csv From 9415ceddb30517bdecf7b413bd6658f3997c46ac Mon Sep 17 00:00:00 2001 From: owidbot Date: Sun, 14 Apr 2024 04:03:41 +0000 Subject: [PATCH 12/40] :robot: automatic wildfires update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 8 ++++---- snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 6 +++--- .../excess_mortality/latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 6eab1c6ccc6..860918472f4 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,12 +9,12 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-04-13 - date_published: 2024-04-13 + date_accessed: 2024-04-14 + date_published: 2024-04-14 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license outs: - - md5: 1bc963ac2662d95647d5d69942a1d416 - size: 11623135 + - md5: 06757d4e2324d884c119b0a8c419e896 + size: 11650883 path: weekly_wildfires.csv diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index fa8aaf52d13..c1816336a5f 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-13 + date_accessed: 2024-04-14 publication_date: 2024-03-18 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 36018b29cb8..8b61273d172 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-13 + date_accessed: 2024-04-14 publication_date: '2021-06-30' publication_year: 2021 published_by: |- @@ -33,6 +33,6 @@ meta: name: MIT License url: https://github.com/akarlinsky/world_mortality/blob/main/LICENSE outs: - - md5: c835bfe0bcc56774ea33176cfe4d5238 - size: 1020641 + - md5: 7fd6ca328d57505575914722eac276f7 + size: 1021704 path: wmd.csv diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 0a792ecb64c..c1df71cfdc6 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. 
url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-13 + date_accessed: 2024-04-14 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index e79268b4901..12d2e379c3b 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-13 + date_accessed: 2024-04-14 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From f26097d9a72c69e0a32ddd2f65317ce282998cd6 Mon Sep 17 00:00:00 2001 From: owidbot Date: Sun, 14 Apr 2024 04:04:47 +0000 Subject: [PATCH 13/40] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 7b19b1ff4f0..461578b668e 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: afdb4e941ee982a63ac479aad6869d2f - size: 150755257 + - md5: 70b53f02d9a64a282de4764cc9c2c897 + size: 150812561 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index df5035ce3bf..0ca209164bf 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. 
wdir: ../../../data/snapshots/who/latest outs: - - md5: 5f7fb645f2cb0cf75314ab1f1473a106 - size: 25784618 + - md5: badd68913d1b5328ffbb03213adb69bf + size: 25794436 path: flunet.csv From 0452f4111017261a519c761d63f93d6b3da8a9bd Mon Sep 17 00:00:00 2001 From: Pablo Arriagada Date: Sun, 14 Apr 2024 23:38:13 -0400 Subject: [PATCH 14/40] :lipatick: make countries in Europe nan for years colonized --- .../data/garden/harvard/2023-09-18/colonial_dates_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py index 632347f3ea9..fa61e722384 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py @@ -330,7 +330,7 @@ def correct_european_countries(tb: Table) -> Table: european_countries = geo.list_countries_in_region(region="Europe") # If the country is in european_countries and last_colonizer is not "zzzz. Never colonized", assign nan to colonizer - for col in ["colonizer", "colonizer_grouped", "last_colonizer", "last_colonizer_grouped"]: + for col in ["colonizer", "colonizer_grouped", "last_colonizer", "years_colonized", "last_colonizer_grouped"]: tb[col] = tb[col].where( ~((tb["country"].isin(european_countries)) & (tb["last_colonizer_grouped"] == "zzzz. Never colonized")), np.nan, From 41e15c8c6333763f273d9e7552b269a17155385b Mon Sep 17 00:00:00 2001 From: Pablo Arriagada Date: Sun, 14 Apr 2024 23:40:35 -0400 Subject: [PATCH 15/40] :lipstick: change metadata names --- .../garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml index 1f6e46694a1..550e22f18f0 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml @@ -52,7 +52,7 @@ tables: conversionFactor: 1 years_colonized: - title: Years the country has been colonized + title: Years a country was an European overseas colony unit: "years" short_unit: "years" description_short: | @@ -64,7 +64,7 @@ tables: description_from_producer: "" processing_level: major display: - name: Years colonized + name: Years a country was an European overseas colony entityAnnotationsMap: "" numDecimalPlaces: 0 conversionFactor: 1 From 6deb85392de403efc03c2a02ef999bc90c29ce81 Mon Sep 17 00:00:00 2001 From: owidbot Date: Mon, 15 Apr 2024 04:03:46 +0000 Subject: [PATCH 16/40] :robot: automatic wildfires update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 4 ++-- snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc | 4 ++-- .../excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 860918472f4..1e9fbe61c30 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -9,8 +9,8 @@ meta: citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - 
date_accessed: 2024-04-14 - date_published: 2024-04-14 + date_accessed: 2024-04-15 + date_published: 2024-04-15 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index c1816336a5f..e17ec668748 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-14 + date_accessed: 2024-04-15 publication_date: 2024-03-18 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 8b61273d172..cc60e04075b 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-14 + date_accessed: 2024-04-15 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index c1df71cfdc6..3624b44c904 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-14 + date_accessed: 2024-04-15 publication_date: '2021-06-30' publication_year: 2021 published_by: |- @@ -22,6 +22,6 @@ meta: url: https://github.com/dkobak/excess-mortality/blob/main/LICENSE access_notes: Contains data by age. outs: - - md5: 32b49a01586cf505a65340762fe80e44 + - md5: 0550fc688c0264e1b59a24d2747cd209 size: 381802 path: xm_karlinsky_kobak.csv diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index 12d2e379c3b..e5bf7ad7629 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. 
url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-14 + date_accessed: 2024-04-15 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From ded92d067d98441759ca8b3109acd6fb1d222c2b Mon Sep 17 00:00:00 2001 From: owidbot Date: Mon, 15 Apr 2024 04:05:32 +0000 Subject: [PATCH 17/40] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 2 +- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 461578b668e..b688b891b6e 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 70b53f02d9a64a282de4764cc9c2c897 + - md5: 81d668993ca1dba5c2dd9feeb5b82218 size: 150812561 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 0ca209164bf..059c1a79e02 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. wdir: ../../../data/snapshots/who/latest outs: - - md5: badd68913d1b5328ffbb03213adb69bf - size: 25794436 + - md5: c330a107ff283f6862d1775b81b2a3bf + size: 25796571 path: flunet.csv From db1055b9a8cc6efacfb7ca223496634ec1fd5755 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 15 Apr 2024 09:45:28 +0200 Subject: [PATCH 18/40] =?UTF-8?q?=F0=9F=93=8A=20Update=20co2=20dataset=20r?= =?UTF-8?q?eference=20(#2524)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Duplicate owid_co2 step * 📊 Update CO2 dataset (#2511) * Adapt owid_co2 dataset * Archive unused steps * Make owid_co2 version latest --- dag/archive/emissions.yml | 25 ++ dag/emissions.yml | 34 +- .../garden/emissions/latest/owid_co2.meta.yml | 9 + .../data/garden/emissions/latest/owid_co2.py | 360 ++++++++++++++++++ 4 files changed, 404 insertions(+), 24 deletions(-) create mode 100644 etl/steps/data/garden/emissions/latest/owid_co2.meta.yml create mode 100644 etl/steps/data/garden/emissions/latest/owid_co2.py diff --git a/dag/archive/emissions.yml b/dag/archive/emissions.yml index d9495ba2a29..cf0b2e82571 100644 --- a/dag/archive/emissions.yml +++ b/dag/archive/emissions.yml @@ -138,3 +138,28 @@ steps: - data://garden/wb/2023-04-30/income_groups data://grapher/gcp/2023-12-05/global_carbon_budget: - data://garden/gcp/2023-12-05/global_carbon_budget + # + # Emissions - CO2 dataset (2023-12-12). 
+ # + data://garden/emissions/2023-12-12/owid_co2: + - data://garden/emissions/2023-11-23/national_contributions + - data://garden/gcp/2023-12-12/global_carbon_budget + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/energy/2023-12-12/primary_energy_consumption + - data://garden/demography/2023-03-31/population + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/regions/2023-01-01/regions + # + # Jones et al. (2023) - National contributions to climate change. + # + data://meadow/emissions/2023-11-23/national_contributions: + - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv + data://garden/emissions/2023-11-23/national_contributions: + - data://meadow/emissions/2023-11-23/national_contributions + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2023-04-30/income_groups + data://grapher/emissions/2023-11-23/national_contributions: + - data://garden/emissions/2023-11-23/national_contributions diff --git a/dag/emissions.yml b/dag/emissions.yml index 5849d3f8642..0b9e9250739 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -45,17 +45,6 @@ steps: data://grapher/gcp/2023-12-12/global_carbon_budget: - data://garden/gcp/2023-12-12/global_carbon_budget # - # Emissions - CO2 dataset (2023-12-12). - # - data://garden/emissions/2023-12-12/owid_co2: - - data://garden/emissions/2023-11-23/national_contributions - - data://garden/gcp/2023-12-12/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - - data://garden/energy/2023-12-12/primary_energy_consumption - - data://garden/demography/2023-03-31/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions - # # RFF - World Carbon Pricing (2022-09-14). # data://meadow/rff/2023-10-19/world_carbon_pricing: @@ -124,23 +113,20 @@ steps: - data://garden/regions/2023-01-01/regions data://grapher/emissions/2024-04-08/national_contributions: - data://garden/emissions/2024-04-08/national_contributions - - ###################################################################################################################### - # Older versions that should be archived once they are not used by any other steps. # - # Jones et al. (2023) - National contributions to climate change. + # Emissions - CO2 dataset. 
# - data://meadow/emissions/2023-11-23/national_contributions: - - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv - data://garden/emissions/2023-11-23/national_contributions: - - data://meadow/emissions/2023-11-23/national_contributions + data://garden/emissions/latest/owid_co2: + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/energy/2023-12-12/primary_energy_consumption + - data://garden/emissions/2024-04-08/national_contributions - data://garden/regions/2023-01-01/regions - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - data://grapher/emissions/2023-11-23/national_contributions: - - data://garden/emissions/2023-11-23/national_contributions + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/gcp/2023-12-12/global_carbon_budget + + ###################################################################################################################### + # Older versions that should be archived once they are not used by any other steps. ###################################################################################################################### diff --git a/etl/steps/data/garden/emissions/latest/owid_co2.meta.yml b/etl/steps/data/garden/emissions/latest/owid_co2.meta.yml new file mode 100644 index 00000000000..d58145ee7f5 --- /dev/null +++ b/etl/steps/data/garden/emissions/latest/owid_co2.meta.yml @@ -0,0 +1,9 @@ +dataset: + title: OWID CO2 dataset + description: | + OWID CO2 dataset. + + This dataset will be loaded by [the co2-data repository](https://github.com/owid/co2-data), to create a csv file of the dataset that can be downloaded in one click. + +# Dataset sources will be created in the step by combining all component datasets' sources. +# Also, table metadata will be built from the tables' original metadata. diff --git a/etl/steps/data/garden/emissions/latest/owid_co2.py b/etl/steps/data/garden/emissions/latest/owid_co2.py new file mode 100644 index 00000000000..33dd14f123c --- /dev/null +++ b/etl/steps/data/garden/emissions/latest/owid_co2.py @@ -0,0 +1,360 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +Datasets combined: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2020) on +GDP are included. + +""" + + +import numpy as np +from owid.catalog import Dataset, Origin, Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor from tonnes to million tonnes. +TONNES_TO_MILLION_TONNES = 1e-6 + +# Select columns to use from each dataset, and how to rename them. 
+GCP_COLUMNS = { + "country": "country", + "year": "year", + "emissions_total": "co2", + "emissions_total_per_capita": "co2_per_capita", + "traded_emissions": "trade_co2", + "emissions_from_cement": "cement_co2", + "emissions_from_cement_per_capita": "cement_co2_per_capita", + "emissions_from_coal": "coal_co2", + "emissions_from_coal_per_capita": "coal_co2_per_capita", + "emissions_from_flaring": "flaring_co2", + "emissions_from_flaring_per_capita": "flaring_co2_per_capita", + "emissions_from_gas": "gas_co2", + "emissions_from_gas_per_capita": "gas_co2_per_capita", + "emissions_from_oil": "oil_co2", + "emissions_from_oil_per_capita": "oil_co2_per_capita", + "emissions_from_other_industry": "other_industry_co2", + "emissions_from_other_industry_per_capita": "other_co2_per_capita", + "pct_growth_emissions_total": "co2_growth_prct", + "growth_emissions_total": "co2_growth_abs", + "emissions_total_per_gdp": "co2_per_gdp", + "emissions_total_per_unit_energy": "co2_per_unit_energy", + "consumption_emissions": "consumption_co2", + "consumption_emissions_per_capita": "consumption_co2_per_capita", + "consumption_emissions_per_gdp": "consumption_co2_per_gdp", + "cumulative_emissions_total": "cumulative_co2", + "cumulative_emissions_from_cement": "cumulative_cement_co2", + "cumulative_emissions_from_coal": "cumulative_coal_co2", + "cumulative_emissions_from_flaring": "cumulative_flaring_co2", + "cumulative_emissions_from_gas": "cumulative_gas_co2", + "cumulative_emissions_from_oil": "cumulative_oil_co2", + "cumulative_emissions_from_other_industry": "cumulative_other_co2", + "pct_traded_emissions": "trade_co2_share", + "emissions_total_as_share_of_global": "share_global_co2", + "emissions_from_cement_as_share_of_global": "share_global_cement_co2", + "emissions_from_coal_as_share_of_global": "share_global_coal_co2", + "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", + "emissions_from_gas_as_share_of_global": "share_global_gas_co2", + "emissions_from_oil_as_share_of_global": "share_global_oil_co2", + "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", + "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", + "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", + "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", + "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", + "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", + "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", + "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", + # New variables, related to land-use change emissions. 
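# ("luc" in the renamed columns that follow is short for land-use change.)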
+ "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", + "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", + "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", + "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", + "emissions_from_land_use_change": "land_use_change_co2", + "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", + "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", + "emissions_total_including_land_use_change": "co2_including_luc", + "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", + "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", + "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", + "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", + "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", + "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", +} +JONES_COLUMNS = { + "country": "country", + "year": "year", + "temperature_response_co2_total": "temperature_change_from_co2", + "temperature_response_ghg_total": "temperature_change_from_ghg", + "temperature_response_ch4_total": "temperature_change_from_ch4", + "temperature_response_n2o_total": "temperature_change_from_n2o", + "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", +} +CLIMATE_WATCH_GHG_COLUMNS = { + "country": "country", + "year": "year", + "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", + "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", + "total_ghg_emissions_including_lucf": "total_ghg", + "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", +} +CLIMATE_WATCH_CH4_COLUMNS = { + "country": "country", + "year": "year", + "total_ch4_emissions_including_lucf": "methane", + "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", +} +CLIMATE_WATCH_N2O_COLUMNS = { + "country": "country", + "year": "year", + "total_n2o_emissions_including_lucf": "nitrous_oxide", + "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", +} +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", + "primary_energy_consumption_per_capita__kwh": "energy_per_capita", + "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", +} +REGIONS_COLUMNS = { + "name": "country", + "iso_alpha3": "iso_code", +} +POPULATION_COLUMNS = { + "country": "country", + "year": "year", + "population": "population", +} +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}} + + +def convert_units(table: Table) -> Table: + """Convert units of table. + + Parameters + ---------- + table : Table + Data with its original units. + + Returns + ------- + Table + Data after converting units of specific columns. + + """ + table = table.copy() + # Check units and convert to more convenient ones. 
+ for column in table.columns: + unit = table[column].metadata.unit + title = table[column].metadata.title + description_short = table[column].metadata.description or table[column].metadata.description_short + if unit in list(UNITS): + table[column] *= UNITS[unit]["conversion"] + table[column].metadata.unit = UNITS[unit]["new_unit"] + table[column].metadata.short_unit = UNITS[unit]["new_short_unit"] + table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"]) + table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"]) + + return table + + +def combine_tables( + tb_gcp: Table, + tb_jones: Table, + tb_climate_watch_ghg: Table, + tb_climate_watch_ch4: Table, + tb_climate_watch_n2o: Table, + tb_energy: Table, + tb_gdp: Table, + tb_population: Table, + tb_regions: Table, +) -> Table: + """Combine tables. + + Parameters + ---------- + tb_gcp : Table + Global Carbon Budget table (from Global Carbon Project). + tb_jones : Table + National contributions to climate change (from Jones et al. (2023)). + tb_climate_watch_ghg : Table + Greenhouse gas emissions table (from Climate Watch). + tb_climate_watch_ch4 : Table + CH4 emissions table (from Climate Watch). + tb_climate_watch_n2o : Table + N2O emissions table (from Climate Watch). + tb_energy : Table + Primary energy consumption table (from BP & EIA). + tb_gdp : Table + Maddison GDP table (from GGDC). + tb_population : Table + OWID population table (from various sources). + tb_regions : Table + OWID regions table. + + Returns + ------- + combined : Table + Combined table with metadata and variables metadata. + + """ + # Combine main tables (with an outer join, to gather all entities from all tables). + combined = tb_gcp.copy() + for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]: + combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) + + # Add secondary tables (with a left join, to keep only entities for which we have emissions data). + for table in [tb_energy, tb_gdp, tb_population]: + combined = combined.merge(table, on=["country", "year"], how="left") + + # Countries-regions dataset does not have a year column, so it has to be merged on country. + combined = combined.merge(tb_regions, on="country", how="left") + + # Check that there were no repetition in column names. + error = "Repeated columns in combined data." + assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error + + # Adjust units. + combined = convert_units(combined) + + return combined + + +def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table: + """Clean and prepare output table. + + Parameters + ---------- + combined : Table + Combined table. + ds_regions : Dataset + Regions dataset, only used to get its version. + + Returns + ------- + combined: Table + Cleaned combined table. + + """ + # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). + columns_that_must_have_data = [ + column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] + ] + combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) + + # Add metadata to the ISO column (loaded from the regions dataset). 
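# (The regions table only contributes the ISO codes, so the column's origin, title,
# description and empty unit are filled in by hand here, with the Origin's
# date_published taken from the regions dataset version.)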
+ combined["iso_code"].m.origins = [ + Origin( + producer="International Organization for Standardization", + title="Regions", + date_published=ds_regions.version, + ) + ] + combined["iso_code"].metadata.title = "ISO code" + combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes." + combined["iso_code"].metadata.unit = "" + + # Sanity check. + columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] + assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" + + # Set index and sort conveniently. + combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() + + return combined + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the global carbon budget dataset from the Global Carbon Project (GCP). + ds_gcp = paths.load_dataset("global_carbon_budget") + + # Load the Jones et al. (2023) dataset on national contributions to climate change. + ds_jones = paths.load_dataset("national_contributions") + + # Load the greenhouse gas emissions by sector dataset by Climate Watch. + ds_climate_watch = paths.load_dataset("emissions_by_sector") + + # Load the GDP dataset by GGDC Maddison. + ds_gdp = paths.load_dataset("ggdc_maddison") + + # Load primary energy consumption dataset (by different sources in our 'energy' namespace). + ds_energy = paths.load_dataset("primary_energy_consumption") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load countries-regions dataset (required to get ISO codes). + ds_regions = paths.load_dataset("regions") + + # Gather all required tables from all datasets. + tb_gcp = ds_gcp["global_carbon_budget"] + tb_jones = ds_jones["national_contributions"] + tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"] + tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"] + tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"] + tb_energy = ds_energy["primary_energy_consumption"] + tb_gdp = ds_gdp["maddison_gdp"] + tb_population = ds_population["population"] + tb_regions = ds_regions["regions"] + + # + # Process data. + # + # Choose required columns and rename them. + tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") + tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") + tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename( + columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise" + ) + tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename( + columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise" + ) + tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename( + columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise" + ) + tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( + columns=PRIMARY_ENERGY_COLUMNS, errors="raise" + ) + tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") + tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( + columns=POPULATION_COLUMNS, errors="raise" + ) + tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") + + # Combine tables. 
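# (combine_tables() joins the emissions tables with an outer merge, so every
# country-year present in any of them is kept; primary energy, GDP and population
# are then attached with left joins, and the regions table is merged on country
# only, since it has no year column.)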
+ combined = combine_tables( + tb_gcp=tb_gcp, + tb_jones=tb_jones, + tb_climate_watch_ghg=tb_climate_watch_ghg, + tb_climate_watch_ch4=tb_climate_watch_ch4, + tb_climate_watch_n2o=tb_climate_watch_n2o, + tb_energy=tb_energy, + tb_gdp=tb_gdp, + tb_population=tb_population, + tb_regions=tb_regions, + ) + + # Prepare outputs. + combined = prepare_outputs(combined=combined, ds_regions=ds_regions) + + # + # Save outputs. + # + ds_garden = create_dataset(dest_dir, tables=[combined], check_variables_metadata=True) + ds_garden.save() From 0337875934e66a61ef74d7deab3c5bbc75951732 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Mon, 15 Apr 2024 10:33:17 +0200 Subject: [PATCH 19/40] =?UTF-8?q?=F0=9F=90=9B=20Fix=20faostat=20animals=20?= =?UTF-8?q?data=20(#2506)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix FAOSTAT aggregate of total meat * Improve documentation --- .../garden/faostat/2024-03-14/faostat_qcl.py | 66 ++++++++++++------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 3c64dab36c9..6e8e4687417 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -1,6 +1,5 @@ """FAOSTAT garden step for faostat_qcl dataset.""" -from pathlib import Path import numpy as np import owid.catalog.processing as pr @@ -31,18 +30,39 @@ ITEM_CODE_MEAT_POULTRY = "00001808" # Item code for 'Meat, chicken'. ITEM_CODE_MEAT_CHICKEN = "00001058" -# List item codes to sum as part of "Meat, total" (avoiding double-counting items). +# Given that the number of slaughtered animals to produce all meat is not provided, we estimate it by aggregating the +# number of slaughtered animals for each meat item. +# List item codes to sum as part of "Meat, Total" (avoiding double-counting items). +# This list can be found following these steps: +# * Go to: https://www.fao.org/faostat/en/#definitions +# * Click on "Item Group" (on the left column). +# * Type "1765" in the search bar (which is the Item Group Code corresponding to Item Group "Meat, Total"). +# * Download the output of the search as a CSV file. +# * Open the file, filter by Item Group Code "1765" and Domain Code "QCL". +# * The list of item codes and item names are in columns "Item Code" and "Item". 
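+# NOTE: Item codes are given as zero-padded 8-character strings (e.g. "00001058"), matching the
+# format of the "item_code" column that is checked against this list further below.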
MEAT_TOTAL_ITEM_CODES = [ - "00000977", # 'Meat, lamb and mutton' (previously 'Meat, lamb and mutton') - "00001035", # 'Meat of pig with the bone, fresh or chilled' (previously 'Meat, pig') - "00001097", # 'Horse meat, fresh or chilled' (previously 'Meat, horse') - "00001108", # 'Meat of asses, fresh or chilled' (previously 'Meat, ass') - "00001111", # 'Meat of mules, fresh or chilled' (previously 'Meat, mule') - "00001127", # 'Meat of camels, fresh or chilled' (previously 'Meat, camel') - "00001141", # 'Meat of rabbits and hares, fresh or chilled' (previously 'Meat, rabbit') - "00001806", # 'Meat, beef and buffalo' (previously 'Meat, beef and buffalo') - "00001807", # 'Meat, sheep and goat' (previously 'Meat, sheep and goat') - ITEM_CODE_MEAT_POULTRY, # 'Meat, poultry' (previously 'Meat, poultry') + "00001058", # 'Meat of chickens, fresh or chilled', + "00001069", # 'Meat of ducks, fresh or chilled', + "00001035", # 'Meat of pig with the bone, fresh or chilled', + "00001017", # 'Meat of goat, fresh or chilled', + "00000977", # 'Meat of sheep, fresh or chilled', + "00000867", # 'Meat of cattle with the bone, fresh or chilled', + "00000947", # 'Meat of buffalo, fresh or chilled', + "00001127", # 'Meat of camels, fresh or chilled', + "00001097", # 'Horse meat, fresh or chilled', + "00001080", # 'Meat of turkeys, fresh or chilled', + "00001141", # 'Meat of rabbits and hares, fresh or chilled', + "00001163", # 'Game meat, fresh, chilled or frozen', + "00001108", # 'Meat of asses, fresh or chilled', + "00001073", # 'Meat of geese, fresh or chilled', + "00001111", # 'Meat of mules, fresh or chilled', + "00001166", # 'Other meat n.e.c. (excluding mammals), fresh, chilled or frozen', + "00001158", # 'Meat of other domestic camelids, fresh or chilled', + "00001151", # 'Meat of other domestic rodents, fresh or chilled', + "00001089", # 'Meat of pigeons and other birds n.e.c., fresh, chilled or frozen', + "00001176", # 'Snails, fresh, chilled, frozen, dried, salted or in brine, except sea snails', + # Items that were in the list of "Meat, Total", but were not in the data: + # "00001083", # 'Other birds', ] # List of element codes for "Producing or slaughtered animals" (they have different items assigned). @@ -163,17 +183,10 @@ def add_slaughtered_animals_to_meat_total(tb: Table) -> Table: error = f"Some items required to get the aggregate '{TOTAL_MEAT_ITEM}' are missing in data." assert set(MEAT_TOTAL_ITEM_CODES) < set(tb["item_code"]), error - assert SLAUGHTERED_ANIMALS_ELEMENT in tb["element"].unique() - assert SLAUGHTERED_ANIMALS_UNIT in tb["unit"].unique() + assert SLAUGHTERED_ANIMALS_ELEMENT in set(tb["element"]) + assert SLAUGHTERED_ANIMALS_UNIT in set(tb["unit"]) - # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. - assert tb[ - (tb["item"] == TOTAL_MEAT_ITEM) - & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) - ].empty - - # There are two element codes for the same element (they have different items assigned). + # Check that there are two element codes for the same element (they have different items assigned). error = "Element codes for 'Producing or slaughtered animals' may have changed." assert ( tb[(tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) & ~(tb["element_code"].str.contains("pc"))]["element_code"] @@ -182,6 +195,13 @@ def add_slaughtered_animals_to_meat_total(tb: Table) -> Table: == SLAUGHTERED_ANIMALS_ELEMENT_CODES ), error + # Check that they use the same unit. 
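+    # (If the two element codes reported different units, summing the slaughtered-animal
+    # counts across items below would no longer be meaningful.)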
+ error = "Unit for element 'Producing or slaughtered animals' may have changed." + assert set(tb[(tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT)]["unit"]) == set(["animals"]), error + + # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. + assert tb[(tb["item"] == TOTAL_MEAT_ITEM) & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT)].empty + # Check that the items assigned to each the two element codes do not overlap. error = "Element codes for 'Producing or slaughtered animals' have overlapping items." items_for_different_elements = ( @@ -437,7 +457,7 @@ def run(dest_dir: str) -> None: # Load data. # # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name + dataset_short_name = f"{NAMESPACE}_qcl" # Define path to current step file. current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") From f75d846421199725199ff33fc9092d0c42cf1de2 Mon Sep 17 00:00:00 2001 From: veronikasamborska1994 <32176660+veronikasamborska1994@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:43:26 +0100 Subject: [PATCH 20/40] Copernicus April update (#2523) --- dag/climate.yml | 2 +- .../2024-04-12/surface_temperature.gz.dvc | 27 +++++++++ .../climate/2024-04-12/surface_temperature.py | 60 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 snapshots/climate/2024-04-12/surface_temperature.gz.dvc create mode 100644 snapshots/climate/2024-04-12/surface_temperature.py diff --git a/dag/climate.yml b/dag/climate.yml index 0dcb469b54a..fd50b88cbeb 100644 --- a/dag/climate.yml +++ b/dag/climate.yml @@ -39,7 +39,7 @@ steps: # Copernicus Climate Change Service - Surface temperature. # data://meadow/climate/2023-12-20/surface_temperature: - - snapshot://climate/2024-03-12/surface_temperature.gz + - snapshot://climate/2024-04-12/surface_temperature.gz - snapshot://countries/2023-12-27/world_bank.zip data://garden/climate/2023-12-20/surface_temperature: - data://meadow/climate/2023-12-20/surface_temperature diff --git a/snapshots/climate/2024-04-12/surface_temperature.gz.dvc b/snapshots/climate/2024-04-12/surface_temperature.gz.dvc new file mode 100644 index 00000000000..15306cfe5dc --- /dev/null +++ b/snapshots/climate/2024-04-12/surface_temperature.gz.dvc @@ -0,0 +1,27 @@ +meta: + origin: + title_snapshot: ERA5 Monthly Averaged Data on Single Levels from 1940 to Present - Monthly Averages of 2m Surface Temperature + title: ERA5 monthly averaged data on single levels from 1940 to present + description: |- + ERA5 is the latest climate reanalysis produced by ECMWF, providing hourly data on many atmospheric, land-surface and sea-state parameters together with estimates of uncertainty. + + ERA5 data are available in the Climate Data Store on regular latitude-longitude grids at 0.25° x 0.25° resolution, with atmospheric parameters on 37 pressure levels. + + ERA5 is available from 1940 and continues to be extended forward in time, with daily updates being made available 5 days behind real time + + Initial release data, i.e., data no more than three months behind real time, are called ERA5T. + producer: Copernicus Climate Change Service + version_producer: 2 + citation_full: |- + Hersbach, H., Bell, B., Berrisford, P., Biavati, G., Horányi, A., Muñoz Sabater, J., Nicolas, J., Peubey, C., Radu, R., Rozum, I., Schepers, D., Simmons, A., Soci, C., Dee, D., Thépaut, J-N. (2023): ERA5 monthly averaged data on single levels from 1940 to present. 
Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.f17050d7 (Accessed on 13-Feb-2024) + url_main: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview + date_accessed: 2024-04-12 + date_published: 2019-04-18 + license: + name: Copernicus License + url: https://cds.climate.copernicus.eu/api/v2/terms/static/licence-to-use-copernicus-products.pdf + +outs: + - md5: 715bc2845ee71aaf0dca48dfec2f2c58 + size: 1702330442 + path: surface_temperature.gz diff --git a/snapshots/climate/2024-04-12/surface_temperature.py b/snapshots/climate/2024-04-12/surface_temperature.py new file mode 100644 index 00000000000..3cc66fcba72 --- /dev/null +++ b/snapshots/climate/2024-04-12/surface_temperature.py @@ -0,0 +1,60 @@ +"""Script to create a snapshot of the monthly averaged surface temperature data from 1950 to present from the Copernicus Climate Change Service. + + The script assumes that the data is available on the CDS API. + Instructions on how to access the API on a Mac are here: https://confluence.ecmwf.int/display/CKB/How+to+install+and+use+CDS+API+on+macOS + + More information on how to access the data is here: https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=overview + + The data is downloaded as a NetCDF file. Tutorials for using the Copernicus API are here and work with the NETCDF format are here: https://ecmwf-projects.github.io/copernicus-training-c3s/cds-tutorial.html + """ + +import gzip +import shutil +import tempfile +from pathlib import Path + +# CDS API +import cdsapi +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/surface_temperature.gz") + # Save data as a compressed temporary file. + with tempfile.TemporaryDirectory() as temp_dir: + c = cdsapi.Client() + output_file = Path(temp_dir) / "era5_monthly_t2m_eur.nc" + + c.retrieve( + "reanalysis-era5-single-levels-monthly-means", + { + "product_type": "monthly_averaged_reanalysis", + "variable": "2m_temperature", + "year": [str(year) for year in range(1940, 2025)], + "month": ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"], + "time": "00:00", + "area": [90, -180, -90, 180], + "format": "netcdf", + }, + output_file, + ) + # Compress the file + with open(output_file, "rb") as f_in: + with gzip.open(str(output_file) + ".gz", "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + gzip_file = str(output_file) + ".gz" + # Upload snapshot. 
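+    # (create_snapshot fills the `outs` entry of the matching .dvc file (md5, size, path)
+    # and uploads the file when --upload is passed.)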
+ snap.create_snapshot(filename=gzip_file, upload=upload) + + +if __name__ == "__main__": + main() From 12f2c35f8a23567bcafdb63ed1bc912f6899b83c Mon Sep 17 00:00:00 2001 From: owidbot Date: Mon, 15 Apr 2024 10:04:57 +0000 Subject: [PATCH 21/40] fasttrack: fasttrack/latest/agricultural_policies_wuepper.csv --- .../fasttrack/latest/agricultural_policies_wuepper.csv.dvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshots/fasttrack/latest/agricultural_policies_wuepper.csv.dvc b/snapshots/fasttrack/latest/agricultural_policies_wuepper.csv.dvc index 07170a872be..1def06c0978 100644 --- a/snapshots/fasttrack/latest/agricultural_policies_wuepper.csv.dvc +++ b/snapshots/fasttrack/latest/agricultural_policies_wuepper.csv.dvc @@ -8,11 +8,11 @@ meta: url_main: https://www.nature.com/articles/s43016-024-00945-8 url_download: |- https://docs.google.com/spreadsheets/d/e/2PACX-1vTxYlN0qXIUUXEok-T_1QmKXw-9eGRG1cQD2EKQSrC8kpWiI_C_f0oJFKU3SuOksmPKjxIS3UQwpU8l/pub?output=csv - date_accessed: '2024-03-26' + date_accessed: '2024-04-15' name: Agricultural policies (Wuepper et al. 2024) description: The number and stringency of agricultural-environmental policies by country. license: {} outs: - - md5: 08b93261bfecce404931598400f219f5 - size: 10062 + - md5: 616cb47603ee7f243557db1dd54b02e1 + size: 10089 path: agricultural_policies_wuepper.csv From c54f84b339c74196f9c2c704654b9cd6ad9e0535 Mon Sep 17 00:00:00 2001 From: owidbot Date: Mon, 15 Apr 2024 11:50:55 +0000 Subject: [PATCH 22/40] fasttrack: fasttrack/latest/conflict_deaths_combined.csv --- dag/fasttrack.yml | 2 ++ .../latest/conflict_deaths_combined.meta.yml | 23 +++++++++++++++++++ .../latest/conflict_deaths_combined.py | 22 ++++++++++++++++++ .../latest/conflict_deaths_combined.csv.dvc | 14 +++++++++++ 4 files changed, 61 insertions(+) create mode 100644 etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.meta.yml create mode 100644 etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.py create mode 100644 snapshots/fasttrack/latest/conflict_deaths_combined.csv.dvc diff --git a/dag/fasttrack.yml b/dag/fasttrack.yml index 9ad5e3fc844..8912820bea8 100644 --- a/dag/fasttrack.yml +++ b/dag/fasttrack.yml @@ -156,3 +156,5 @@ steps: - snapshot://fasttrack/latest/global_precipitation_anomaly_noaa.csv data://grapher/fasttrack/latest/gpei: - snapshot://fasttrack/latest/gpei.csv + data-private://grapher/fasttrack/latest/conflict_deaths_combined: + - snapshot-private://fasttrack/latest/conflict_deaths_combined.csv diff --git a/etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.meta.yml b/etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.meta.yml new file mode 100644 index 00000000000..72a28530d02 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.meta.yml @@ -0,0 +1,23 @@ +dataset: + title: DRAFT conflict_deaths_combined + description: '' + licenses: + - {} +tables: + conflict_deaths_combined: + variables: + onesided_deaths: + title: onesided_deaths + unit: '' + nonstate_deaths: + title: nonstate_deaths + unit: '' + intrastate_deaths: + title: intrastate_deaths + unit: '' + interstate_deaths: + title: interstate_deaths + unit: '' + all_deaths: + title: all_deaths + unit: '' diff --git a/etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.py b/etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.py new file mode 100644 index 00000000000..f8475e892c6 --- /dev/null +++ 
b/etl/steps/data/grapher/fasttrack/latest/conflict_deaths_combined.py @@ -0,0 +1,22 @@ +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/latest/conflict_deaths_combined.csv") + + # load data + tb = snap.read_csv() + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() diff --git a/snapshots/fasttrack/latest/conflict_deaths_combined.csv.dvc b/snapshots/fasttrack/latest/conflict_deaths_combined.csv.dvc new file mode 100644 index 00000000000..821eb2d7286 --- /dev/null +++ b/snapshots/fasttrack/latest/conflict_deaths_combined.csv.dvc @@ -0,0 +1,14 @@ +meta: + source: + name: Unknown + url: '' + source_data_url: gAAAAABmHRSaP_x2DAk2mJbIu3t0K1qg0C5hYhRTgMa6gF8O32w49j8YI13vwsllJ0cbUDM38yH9wMiXjVID6LWrSb8QgiKpiOO6c-JM5uc-4ZnKFV_oFR0= + date_accessed: '2024-04-15' + name: DRAFT conflict_deaths_combined + description: '' + license: {} + is_public: false +outs: + - md5: 76da540ac83b8df947ffa30f7655b9cb + size: 377 + path: conflict_deaths_combined.csv From f99f4cffc4958240c4cea895bb5b8923bff72f2a Mon Sep 17 00:00:00 2001 From: Marigold Date: Mon, 15 Apr 2024 15:35:50 +0200 Subject: [PATCH 23/40] :hammer: exclude country_profile from datadiff --- apps/owidbot/etldiff.py | 2 +- .../unep/2023-03-17/consumption_controlled_substances.meta.yml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py index 261feb56da0..a721b43bd0f 100644 --- a/apps/owidbot/etldiff.py +++ b/apps/owidbot/etldiff.py @@ -17,7 +17,7 @@ log = structlog.get_logger() -EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet" +EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet|country_profile" @click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__) diff --git a/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml b/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml index d7c3699b9a3..919ca832bd0 100644 --- a/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml +++ b/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml @@ -17,9 +17,6 @@ dataset: Negative values for a given year imply that quantities destroyed or quantities exported for the year exceeded the sum of production and imports, implying that the destroyed or exported quantities came from stockpiles. 
- licenses: - - name: # Example: Testing License Name - url: # Example: https://url_of_testing_source.com/license sources: - *source-testing From 3a5594d620cbe3142efc1a2d066b863f18dcc468 Mon Sep 17 00:00:00 2001 From: Pablo Arriagada <63430031+paarriagadap@users.noreply.github.com> Date: Mon, 15 Apr 2024 13:57:45 -0400 Subject: [PATCH 24/40] =?UTF-8?q?=F0=9F=93=8A=20wb:=20Update=20World=20Ban?= =?UTF-8?q?k=20PIP=20to=20version=2020240326=20(#2464)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update World Bank PIP * :bug: fix index order of the check * :bug: create inc and cons spells without losing data * :bug: fix issue with Poland income and consumption data * :lipstick: add comments and description_processing about Poland * :bug: fix text about Poland --- dag/poverty_inequality.yml | 22 +- etl/steps/data/garden/wb/2024-03-27/shared.py | 845 +++++++++ .../2024-03-27/world_bank_pip.countries.json | 181 ++ .../wb/2024-03-27/world_bank_pip.meta.yml | 473 +++++ .../garden/wb/2024-03-27/world_bank_pip.py | 1203 +++++++++++++ .../world_bank_pip_2011ppp.meta.yml | 4 + .../wb/2024-03-27/world_bank_pip_2011ppp.py | 34 + .../world_bank_pip_2017ppp.meta.yml | 4 + .../wb/2024-03-27/world_bank_pip_2017ppp.py | 34 + .../meadow/wb/2024-03-27/world_bank_pip.py | 51 + snapshots/wb/2024-03-27/pip_api.py | 1573 +++++++++++++++++ .../wb/2024-03-27/world_bank_pip.csv.dvc | 31 + snapshots/wb/2024-03-27/world_bank_pip.py | 36 + .../world_bank_pip_percentiles.csv.dvc | 33 + .../2024-03-27/world_bank_pip_percentiles.py | 25 + 15 files changed, 4538 insertions(+), 11 deletions(-) create mode 100644 etl/steps/data/garden/wb/2024-03-27/shared.py create mode 100644 etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json create mode 100644 etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml create mode 100644 etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py create mode 100644 etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml create mode 100644 etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py create mode 100644 etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml create mode 100644 etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py create mode 100644 etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py create mode 100644 snapshots/wb/2024-03-27/pip_api.py create mode 100644 snapshots/wb/2024-03-27/world_bank_pip.csv.dvc create mode 100644 snapshots/wb/2024-03-27/world_bank_pip.py create mode 100644 snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc create mode 100644 snapshots/wb/2024-03-27/world_bank_pip_percentiles.py diff --git a/dag/poverty_inequality.yml b/dag/poverty_inequality.yml index d44b0e1f3b0..d156dd48257 100644 --- a/dag/poverty_inequality.yml +++ b/dag/poverty_inequality.yml @@ -7,24 +7,24 @@ steps: # Poverty and inequality file for Joe's PhD data://explorers/poverty_inequality/latest/poverty_inequality_export: - - data://garden/wb/2024-01-17/world_bank_pip + - data://garden/wb/2024-03-27/world_bank_pip - data://garden/wid/2023-08-24/world_inequality_database - data://garden/lis/2023-08-30/luxembourg_income_study - data://garden/wb/2024-01-22/thousand_bins_distribution - data://garden/worldbank_wdi/2023-05-29/wdi # World Bank Poverty and Inequality Platform - data://meadow/wb/2024-01-17/world_bank_pip: - - snapshot://wb/2024-01-17/world_bank_pip.csv - - snapshot://wb/2024-01-17/world_bank_pip_percentiles.csv - data://garden/wb/2024-01-17/world_bank_pip: - - 
data://meadow/wb/2024-01-17/world_bank_pip - data://grapher/wb/2024-01-17/world_bank_pip_2011ppp: - - data://garden/wb/2024-01-17/world_bank_pip - data://grapher/wb/2024-01-17/world_bank_pip_2017ppp: - - data://garden/wb/2024-01-17/world_bank_pip + data://meadow/wb/2024-03-27/world_bank_pip: + - snapshot://wb/2024-03-27/world_bank_pip.csv + - snapshot://wb/2024-03-27/world_bank_pip_percentiles.csv + data://garden/wb/2024-03-27/world_bank_pip: + - data://meadow/wb/2024-03-27/world_bank_pip + data://grapher/wb/2024-03-27/world_bank_pip_2011ppp: + - data://garden/wb/2024-03-27/world_bank_pip + data://grapher/wb/2024-03-27/world_bank_pip_2017ppp: + - data://garden/wb/2024-03-27/world_bank_pip data://explorers/wb/latest/world_bank_pip: - - data://garden/wb/2024-01-17/world_bank_pip + - data://garden/wb/2024-03-27/world_bank_pip # World Inequality Database data://meadow/wid/2023-08-24/world_inequality_database: diff --git a/etl/steps/data/garden/wb/2024-03-27/shared.py b/etl/steps/data/garden/wb/2024-03-27/shared.py new file mode 100644 index 00000000000..4bf8096f67d --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/shared.py @@ -0,0 +1,845 @@ +""" +This file includes functions to get variables metadata in the `world_bank_pip` garden step +If new poverty lines or indicators are included, they need to be addressed here +""" + +from owid.catalog import Table, VariableMeta, VariablePresentationMeta + +# This is text to include in description_key and description_processing fields + +non_market_income_description = "Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account." + +processing_description_relative_poverty = "Measures of relative poverty are not directly available in the World Bank PIP data. To calculate this metric we take the median income or consumption for the country and year, calculate a relative poverty line – in this case {povline} of the median – and then run a specific query on the PIP API to return the share of population below that line." + +processing_description_thr = "Income and consumption thresholds by decile are not directly available in the World Bank PIP API. We extract the metric primarily from [auxiliary percentiles data provided by the World Bank](https://datacatalog.worldbank.org/search/dataset/0063646). Missing country values and regional aggregations of the indicator are calculated by running multiple queries on the API to obtain the closest poverty line to each threshold." + +processing_description_avg = "Income and consumption averages by decile are not directly available in the World Bank PIP API. We calculate the metric by multiplying the share of each decile by the mean income or consumption of the distribution and dividing by the population share of the decile (10%)." + +relative_poverty_description = "This is a measure of _relative_ poverty – it captures the share of people whose income is low by the standards typical in their own country." + +ppp_description = "The data is measured in international-$ at {ppp} prices – this adjusts for inflation and for differences in the cost of living between countries." + +processing_description_thr_percentiles = "Missing country values and regional aggregations of the threshold indicator are calculated by running multiple queries on the API to obtain the closest poverty line to each threshold. 
This data is merged with the percentile files [provided by the World Bank](https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles)." + + +# Define default tolerance for each variable +TOLERANCE = 5 + +# These are parameters specifically defined for each type of variable +var_dict = { + # POVERTY + "headcount": { + "title": "Number in poverty", + "description": "Number of people in households with an {inc_cons_dict[wel]['name']} per person below {povline}", + "unit": "people", + "short_unit": "", + "numDecimalPlaces": 0, + }, + "headcount_ratio": { + "title": "Share of population in poverty", + "description": "Percentage of population living in households with an {inc_cons_dict[wel]['name']} per person below {povline}", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "total_shortfall": { + "title": "Total daily shortfall", + "description": "This is the amount of money that would be theoretically needed to lift the {inc_cons_dict[wel]['name']} of all people in poverty up to {povline}. However, this is not a measure of the actual cost of eliminating poverty, since it does not take into account the costs involved in making the necessary transfers nor any changes in behaviour they would bring about.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "avg_shortfall": { + "title": "Average shortfall ($)", + "description": "This is the amount of money that would be theoretically needed to lift the {inc_cons_dict[wel]['name']} of all people in poverty up to {povline}, averaged across the population in poverty.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "income_gap_ratio": { + "title": "Average shortfall (%)", + "description": "This is the average shortfall expressed as a share of the poverty line, sometimes called the 'income gap ratio'. It captures the depth of poverty of those living on less than {povline}.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "poverty_gap_index": { + "title": "Poverty gap index", + "description": "The poverty gap index is a poverty measure that reflects both the prevalence and the depth of poverty. It is calculated as the share of population in poverty multiplied by the average shortfall from the poverty line (expressed as a % of the poverty line).", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "watts": { + "title": "Watts index", + "description": "This is the mean across the population of the proportionate poverty gaps, as measured by the log of the ratio of the poverty line to income, where the mean is formed over the whole population, counting the nonpoor as having a zero poverty gap.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "poverty_severity": { + "title": "Poverty severity", + "description": "It is calculated as the square of the income gap ratio, the average shortfall expressed as a share of the poverty line.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + # INEQUALITY + "gini": { + "title": "Gini coefficient", + "description": "The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. 
Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + "palma_ratio": { + "title": "Palma ratio", + "description": "The Palma ratio is a measure of inequality that divides the share received by the richest 10% by the share of the poorest 40%. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "s80_s20_ratio": { + "title": "S80/S20 ratio", + "description": "The S80/S20 ratio is a measure of inequality that divides the share received by the richest 20% by the share of the poorest 20%. Higher values indicate higher inequality.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p90_p10_ratio": { + "title": "P90/P10 ratio", + "description": "P90 and P10 are the levels of {inc_cons_dict[wel]['name']} below which 90% and 10% of the population live, respectively. This variable gives the ratio of the two. It is a measure of inequality that indicates the gap between the richest and poorest tenth of the population.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p90_p50_ratio": { + "title": "P90/P50 ratio", + "description": "The P90/P50 ratio measures the degree of inequality within the richest half of the population. A ratio of 2 means that someone just falling in the richest tenth of the population has twice the median {inc_cons_dict[wel]['name']}.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "p50_p10_ratio": { + "title": "P50/P10 ratio", + "description": "The P50/P10 ratio measures the degree of inequality within the poorest half of the population. A ratio of 2 means that the median {inc_cons_dict[wel]['name']} is two times higher than that of someone just falling in the poorest tenth of the population.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 1, + }, + "mld": { + "title": "Mean log deviation", + "description": "The mean log deviation (MLD) is a measure of inequality. An MLD of zero indicates perfect equality and it takes on larger positive values as incomes become more unequal.", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + "polarization": { + "title": "Polarization index", + "description": "The polarization index, also known as the Wolfson polarization index, measures the extent to which the distribution of income or consumption is “spread out” and bi-modal. 
Like the Gini coefficient, the polarization index ranges from 0 (no polarization) to 1 (complete polarization).", + "unit": "", + "short_unit": "", + "numDecimalPlaces": 2, + }, + # DISTRIBUTIONAL INDICATORS + "mean": { + "title": "Mean", + "description": "Mean {inc_cons_dict[wel]['name']}.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "median": { + "title": "Median", + "description": "Median {inc_cons_dict[wel]['name']}.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "avg": { + "title": "Average", + "description": "The mean {inc_cons_dict[wel]['name_distribution']} per year within the {pct_dict[pct]['decile10']} (tenth of the population).", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "share": { + "title": "Share", + "description": "The share of {inc_cons_dict[wel]['name_distribution']} {inc_cons_dict[wel]['verb']} by the {pct_dict[pct]['decile10']} (tenth of the population).", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "thr": { + "title": "Threshold", + "description": "The level of {inc_cons_dict[wel]['name_distribution']} per year below which {str(pct)}% of the population falls.", + "unit": "international-$ in {ppp} prices", + "short_unit": "$", + "numDecimalPlaces": 2, + }, + "bottom50_share": { + "title": "Share of the bottom 50%", + "description": "The share of {inc_cons_dict[wel]['name_distribution']} {inc_cons_dict[wel]['verb']} by the poorest 50%.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, + "middle40_share": { + "title": "Share of the middle 40%", + "description": "The share of {inc_cons_dict[wel]['name_distribution']} {inc_cons_dict[wel]['verb']} by the middle 40%. The middle 40% is the share of the population whose {inc_cons_dict[wel]['name']} lies between the poorest 50% and the richest 10%.", + "unit": "%", + "short_unit": "%", + "numDecimalPlaces": 1, + }, +} + +# Details for each consumption or income variable +inc_cons_dict = { + "income": { + "name": "income", + "name_distribution": "after tax income", + "verb": "received", + "description": "The data relates to income measured after taxes and benefits per capita. 'Per capita' means that the income of each household is attributed equally to each member of the household (including children).", + "processing_description": """To construct a global dataset, the World Bank combines estimates based on income data and estimates based on consumption data. Here we only include the estimates based on income data. + +You can find the data with all available income and consumption data points in our [complete dataset](https://github.com/owid/poverty-data#a-global-dataset-of-poverty-and-inequality-measures-prepared-by-our-world-in-data-from-the-world-banks-poverty-and-inequality-platform-pip-database) of the World Bank PIP data.""", + }, + "consumption": { + "name": "consumption", + "name_distribution": "consumption", + "verb": "spent", + "description": "The data relates to consumption per capita. 'Per capita' means that the consumption of each household is attributed equally to each member of the household (including children).", + "processing_description": """To construct a global dataset, the World Bank combines estimates based on income data and estimates based on consumption data. Here we only include the estimates based on consumption data. 
+ +You can find the data with all available income and consumption data points in our [complete dataset](https://github.com/owid/poverty-data#a-global-dataset-of-poverty-and-inequality-measures-prepared-by-our-world-in-data-from-the-world-banks-poverty-and-inequality-platform-pip-database) of the World Bank PIP data.""", + }, + "income_consumption": { + "name": "income or consumption", + "name_distribution": "after tax income or consumption", + "verb": "received", + "description": "Depending on the country and year, the data relates to income measured after taxes and benefits, or to consumption, per capita. 'Per capita' means that the income of each household is attributed equally to each member of the household (including children).", + "processing_description": """For a small number of country-year observations, the World Bank PIP data contains two estimates: one based on income data and one based on consumption data. In these cases we keep only the consumption estimate in order to obtain a single series for each country. + +To avoid data misinterpretations, we dropped income estimates for absolute poverty in Poland from 2020 onwards because they differ considerably from the consumption estimates kept from previous years. + +You can find the data with all available income and consumption data points, including these overlapping estimates, in our [complete dataset](https://github.com/owid/poverty-data#a-global-dataset-of-poverty-and-inequality-measures-prepared-by-our-world-in-data-from-the-world-banks-poverty-and-inequality-platform-pip-database) of the World Bank PIP data.""", + }, +} + +# Details for each relative poverty line +rel_dict = {40: "40% of the median", 50: "50% of the median", 60: "60% of the median"} + +# Details for each absolute poverty line +abs_dict = { + 2011: { + 100: {"title": "$1 a day", "title_between": "$1", "description_key": ""}, + 190: { + "title": "$1.90 a day", + "title_between": "$1.90", + "description_key": "Extreme poverty here is defined as living below the International Poverty Line of $1.90 per day.", + }, + 320: { + "title": "$3.20 a day", + "title_between": "$3.20", + "description_key": "A poverty line of $3.20 a day represents definitions of national poverty lines in lower-middle-income countries.", + }, + 550: { + "title": "$5.50 a day", + "title_between": "$5.50", + "description_key": "A poverty line of $5.50 a day represents definitions of national poverty lines in upper-middle-income countries.", + }, + 1000: { + "title": "$10 a day", + "title_between": "$10", + "description_key": "", + }, + 2000: { + "title": "$20 a day", + "title_between": "$20", + "description_key": "", + }, + 3000: { + "title": "$30 a day", + "title_between": "$30", + "description_key": "A poverty line of $30 a day represents definitions of national poverty lines in high-income countries.", + }, + 4000: { + "title": "$40 a day", + "title_between": "$40", + "description_key": "", + }, + }, + 2017: { + 100: {"title": "$1 a day", "title_between": "$1", "description_key": ""}, + 215: { + "title": "$2.15 a day", + "title_between": "$2.15", + "description_key": "Extreme poverty here is defined as living below the International Poverty Line of $2.15 per day.", + }, + 365: { + "title": "$3.65 a day", + "title_between": "$3.65", + "description_key": "A poverty line of $3.65 a day represents definitions of national poverty lines in lower-middle-income countries.", + }, + 685: { + "title": "$6.85 a day", + "title_between": "$6.85", + "description_key": "A poverty line of $6.85 a day 
represents definitions of national poverty lines in upper-middle-income countries.", + }, + 1000: { + "title": "$10 a day", + "title_between": "$10", + "description_key": "", + }, + 2000: { + "title": "$20 a day", + "title_between": "$20", + "description_key": "", + }, + 3000: { + "title": "$30 a day", + "title_between": "$30", + "description_key": "A poverty line of $30 a day represents definitions of national poverty lines in high-income countries.", + }, + 4000: { + "title": "$40 a day", + "title_between": "$40", + "description_key": "", + }, + }, +} + +# Details for naming each decile/percentile +pct_dict = { + 1: {"decile10": "Poorest decile", "decile9": "Poorest decile"}, + 2: {"decile10": "2nd decile", "decile9": "2nd decile"}, + 3: {"decile10": "3rd decile", "decile9": "3rd decile"}, + 4: {"decile10": "4th decile", "decile9": "4th decile"}, + 5: {"decile10": "5th decile", "decile9": "5th decile"}, + 6: {"decile10": "6th decile", "decile9": "6th decile"}, + 7: {"decile10": "7th decile", "decile9": "7th decile"}, + 8: {"decile10": "8th decile", "decile9": "8th decile"}, + 9: {"decile10": "9th decile", "decile9": "Richest decile"}, + 10: {"decile10": "Richest decile", "decile9": ""}, +} + + +def add_metadata_vars(tb_garden: Table, ppp_version: int, welfare_type: str) -> Table: + """ + Add metadata for each variable in the dataset, using the dictionaries above and the functions below + """ + + # Add short name + tb_garden.metadata.short_name = f"{welfare_type}_{ppp_version}" + + # Create a list from abs_dict + povline_list = list(abs_dict[ppp_version].keys()) + + # Get a list of all the variables available + cols = list(tb_garden.columns) + + for var in var_dict: + # For variables uniquely defined for each country-year-welfare type-reporting level (mostly inequality indicators + mean and median) + col_name = f"{var}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_inequality_mean_median(var, origins, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + .replace("{inc_cons_dict[wel]['name_distribution']}", inc_cons_dict[welfare_type]["name_distribution"]) + .replace("{inc_cons_dict[wel]['verb']}", inc_cons_dict[welfare_type]["verb"]) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + for povline in povline_list: + # For variables that use absolute poverty lines + col_name = f"{var}_{povline}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_absolute_povlines( + var, povline, origins, ppp_version, welfare_type + ) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{povline}", abs_dict[ppp_version][povline]["title"]) + .replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in 
tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + # For variables above poverty lines + col_name = f"{var}_above_{povline}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_absolute_povlines( + var, povline, origins, ppp_version, welfare_type + ) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{povline}", abs_dict[ppp_version][povline]["title"]) + .replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + # Replace "below" with "above" in the description + tb_garden[col_name].metadata.description_short = tb_garden[col_name].metadata.description_short.replace( + "below", "above" + ) + + # Replace "in poverty" with "not in poverty" in the title + tb_garden[col_name].metadata.title = tb_garden[col_name].metadata.title.replace( + "in poverty", "not in poverty" + ) + + # Replicate the title in the display name and title_public + tb_garden[col_name].metadata.display["name"] = tb_garden[col_name].metadata.title + tb_garden[col_name].metadata.presentation = VariablePresentationMeta( + title_public=tb_garden[col_name].metadata.title + ) + + for i in range(len(povline_list)): + if i != 0: + # For variables between poverty lines + col_name = f"{var}_between_{povline_list[i-1]}_{povline_list[i]}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_between_absolute_povlines( + var, povline_list[i - 1], povline_list[i], origins, ppp_version, welfare_type + ) + + # For variables between poverty lines that jump the original order + col_name = f"{var}_between_{povline_list[1]}_{povline_list[4]}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_between_absolute_povlines( + var, povline_list[1], povline_list[4], origins, ppp_version, welfare_type + ) + + col_name = f"{var}_between_{povline_list[4]}_{povline_list[6]}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_between_absolute_povlines( + var, povline_list[4], povline_list[6], origins, ppp_version, welfare_type + ) + + for rel in rel_dict: + # For variables that use relative poverty lines + col_name = f"{var}_{rel}_median" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_relative_povlines(var, rel, origins, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{povline}", rel_dict[rel]) + .replace("{inc_cons_dict[wel]['name']}", 
inc_cons_dict[welfare_type]["name"]) + ) + + for pct in pct_dict: + # For variables that use percentiles (deciles) + col_name = f"decile{pct}_{var}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_percentiles(var, pct, origins, ppp_version, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{str(pct)}", f"{str(pct)}0") + .replace( + "{inc_cons_dict[wel]['name_distribution']}", + inc_cons_dict[welfare_type]["name_distribution"], + ) + .replace("{inc_cons_dict[wel]['verb']}", inc_cons_dict[welfare_type]["verb"]) + .replace("{pct_dict[pct]['decile10']}", pct_dict[pct]["decile10"].lower()) + ) + + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + return tb_garden + + +# Metadata functions to show a clearer main code +def var_metadata_inequality_mean_median(var, origins, welfare_type) -> VariableMeta: + """ + Create metadata for defined uniquely by their name + """ + # For monetary variables I include PPP description + if var in ["mean", "median"]: + meta = VariableMeta( + title=f"{var_dict[var]['title']} {inc_cons_dict[welfare_type]['name']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + else: + meta = VariableMeta( + title=f"{var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_absolute_povlines(var, povline, origins, ppp_version, welfare_type) -> VariableMeta: + """ + Create metadata for variables with absolute poverty lines + """ + # Define the list of description_key, to then remove the empty ones + description_key_list = [ + abs_dict[ppp_version][povline]["description_key"], + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ] + + # Remove empty strings from the list + description_key_list = list(filter(None, description_key_list)) + + meta = VariableMeta( + title=f"{abs_dict[ppp_version][povline]['title']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=description_key_list, + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + 
unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_between_absolute_povlines(var, povline1, povline2, origins, ppp_version, welfare_type) -> VariableMeta: + """ + Create metadata for variables between poverty lines + """ + + meta = VariableMeta( + title=f"{abs_dict[ppp_version][povline1]['title_between']}-{abs_dict[ppp_version][povline2]['title_between']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"].replace("{ppp}", str(ppp_version)), + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + meta.description_short = meta.description_short.replace( + "{povline}", + f"living between {abs_dict[ppp_version][povline1]['title_between']} and {abs_dict[ppp_version][povline2]['title_between']} a day", + ).replace("{inc_cons_dict[wel]['name']}", inc_cons_dict[welfare_type]["name"]) + + meta.description_key = [ppp.replace("{ppp}", str(ppp_version)) for ppp in meta.description_key] + + meta.unit = meta.unit.replace("{ppp}", str(ppp_version)) + + return meta + + +def var_metadata_relative_povlines(var, rel, origins, welfare_type) -> VariableMeta: + """ + Create metadata for variables with relative poverty lines + """ + + meta = VariableMeta( + title=f"{rel_dict[rel]} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + relative_poverty_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_relative_poverty} + +{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +def var_metadata_percentiles(var, pct, origins, ppp_version, welfare_type) -> VariableMeta: + """ + Create metadata for variables with percentiles + """ + + if var == "thr": + meta = VariableMeta( + title=f"{pct_dict[pct]['decile9']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_thr} + +{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + elif var == "avg": + meta = VariableMeta( + title=f"{pct_dict[pct]['decile10']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + 
description_processing=f"""{processing_description_avg} + +{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + # For shares + else: + meta = VariableMeta( + title=f"{pct_dict[pct]['decile10']} - {var_dict[var]['title']}", + description_short=var_dict[var]["description"], + description_key=[ + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{inc_cons_dict[welfare_type]['processing_description']}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta + + +# FOR PERCENTILES +def add_metadata_vars_percentiles(tb_garden: Table, ppp_version: int, welfare_type: str) -> Table: + """ + Add metadata for each variable in the dataset, using the dictionaries above and the functions below + This is done for the percentile tables + """ + + # Add short name + tb_garden.metadata.short_name = f"percentiles_{welfare_type}_{ppp_version}" + + # Get a list of all the variables available + cols = list(tb_garden.columns) + + for var in var_dict: + # For variables uniquely defined for each country-year-welfare type-reporting level (mostly inequality indicators + mean and median) + col_name = f"{var}" + + if col_name in cols: + # Get the origins of the variable + origins = tb_garden[col_name].metadata.origins + + # Create metadata for these variables + tb_garden[col_name].metadata = var_metadata_percentile_table(var, origins, welfare_type) + + # Replace placeholders + tb_garden[col_name].metadata.description_short = ( + tb_garden[col_name] + .metadata.description_short.replace("{str(pct)}", "each 1") + .replace( + "{inc_cons_dict[wel]['name_distribution']}", + inc_cons_dict[welfare_type]["name_distribution"], + ) + .replace("{inc_cons_dict[wel]['verb']}", inc_cons_dict[welfare_type]["verb"]) + .replace( + "the {pct_dict[pct]['decile10']} (tenth of the population)", + "each percentile (hundredth of the population)", + ) + ) + tb_garden[col_name].metadata.description_key = [ + ppp.replace("{ppp}", str(ppp_version)) for ppp in tb_garden[col_name].metadata.description_key + ] + + tb_garden[col_name].metadata.unit = tb_garden[col_name].metadata.unit.replace("{ppp}", str(ppp_version)) + + return tb_garden + + +def var_metadata_percentile_table(var, origins, welfare_type) -> VariableMeta: + """ + Create metadata for variables with percentiles + """ + + if var == "thr": + meta = VariableMeta( + title=f"{inc_cons_dict[welfare_type]['name'].capitalize()} {var_dict[var]['title'].lower()}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing=f"""{processing_description_thr_percentiles}""", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + elif var == "avg": + meta = VariableMeta( + title=f"{inc_cons_dict[welfare_type]['name'].capitalize()} {var_dict[var]['title'].lower()}", + description_short=var_dict[var]["description"], + description_key=[ + ppp_description, + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing="", + unit=var_dict[var]["unit"], + 
short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + # For shares + else: + meta = VariableMeta( + title=f"{inc_cons_dict[welfare_type]['name'].capitalize()} {var_dict[var]['title'].lower()}", + description_short=var_dict[var]["description"], + description_key=[ + inc_cons_dict[welfare_type]["description"], + non_market_income_description, + ], + description_processing="", + unit=var_dict[var]["unit"], + short_unit=var_dict[var]["short_unit"], + origins=origins, + ) + + meta.display = { + "name": meta.title, + "numDecimalPlaces": var_dict[var]["numDecimalPlaces"], + "tolerance": TOLERANCE, + } + + meta.presentation = VariablePresentationMeta(title_public=meta.title) + + return meta diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json new file mode 100644 index 00000000000..73342a8a395 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.countries.json @@ -0,0 +1,181 @@ +{ + "Albania": "Albania", + "Algeria": "Algeria", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo, Dem. 
Rep.": "Democratic Republic of Congo", + "Congo, Rep.": "Congo", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt, Arab Rep.": "Egypt", + "El Salvador": "El Salvador", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia, The": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran, Islamic Rep.": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea, Rep.": "South Korea", + "Kosovo": "Kosovo", + "Kyrgyz Republic": "Kyrgyzstan", + "Lao PDR": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia, Fed. Sts.": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Pakistan": "Pakistan", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Romania": "Romania", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "South Africa": "South Africa", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "St. 
Lucia": "Saint Lucia", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syrian Arab Republic": "Syria", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkmenistan": "Turkmenistan", + "Tuvalu": "Tuvalu", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela, RB": "Venezuela", + "Viet Nam": "Vietnam", + "West Bank and Gaza": "Palestine", + "World": "World", + "Yemen, Rep.": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "East Asia & Pacific": "East Asia and Pacific (PIP)", + "Eastern and Southern Africa": "Eastern and Southern Africa (PIP)", + "Europe & Central Asia": "Europe and Central Asia (PIP)", + "Latin America & Caribbean": "Latin America and the Caribbean (PIP)", + "Middle East & North Africa": "Middle East and North Africa (PIP)", + "Other High Income Countries": "Other high income countries (PIP)", + "South Asia": "South Asia (PIP)", + "Sub-Saharan Africa": "Sub-Saharan Africa (PIP)", + "Taiwan, China": "Taiwan", + "Turkiye": "Turkey", + "Western and Central Africa": "Western and Central Africa (PIP)" +} \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml new file mode 100644 index 00000000000..a6eba01d529 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.meta.yml @@ -0,0 +1,473 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Poverty + - Economic Inequality + - Economic Growth + attribution_short: World Bank + grapher_config: + originUrl: https://ourworldindata.org/poverty + $schema: https://files.ourworldindata.org/schemas/grapher-schema.003.json + processing_level: major + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + update_period_days: 180 + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/tables/ +tables: + income_consumption_2017: + # Learn more about the available fields: + # http://docs.owid.io/projects/etl/architecture/metadata/reference/indicator/ + variables: + headcount_ratio_215: + presentation: + title_public: Share of population living in extreme poverty + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: Share of population living in extreme poverty + subtitle: >- + Extreme poverty is defined as living below the International Poverty Line of + $2.15 per day. This data is adjusted for inflation and for differences in the + cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. 
Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 3 + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_365: + presentation: + title_public: "Poverty: Share of population living on less than $3.65 a day" + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: "Poverty: Share of population living on less than $3.65 a day" + subtitle: >- + The poverty line of $3.65 per day is set by the World Bank to be representative of the definitions of poverty adopted in lower-middle-income countries. This data is adjusted for inflation and for differences in the cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_685: + presentation: + title_public: "Poverty: Share of population living on less than $6.85 a day" + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: "Poverty: Share of population living on less than $6.85 a day" + subtitle: >- + The poverty line of $6.85 per day is set by the World Bank to be representative of the definitions of poverty adopted in upper-middle-income countries. This data is adjusted for inflation and for differences in the cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). 
+ hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + + headcount_ratio_3000: + presentation: + title_public: Share of population living on less than $30 a day + faqs: + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: "Poverty: Share of population living on less than $30 a day" + subtitle: >- + This data is adjusted for inflation and for differences in the cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at 2017 prices. Depending on the country and year, it relates to income measured after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + variantName: Line chart + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: OrRd + binningStrategy: manual + customNumericValues: + - 10 + - 20 + - 30 + - 40 + - 50 + - 60 + - 70 + - 80 + - 90 + - 100 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_60_median: + presentation: + title_public: Share of population below 60% of median income or consumption + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Relative poverty: Share of people below 60% of median income' + subtitle: >- + Relative poverty is measured in terms of a poverty line that rises and falls + over time with average incomes — in this case set at 60% of median income. + note: >- + Depending on the country and year, the data relates to income measured after + taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + yAxis: + min: 0 + colorScale: + baseColorScheme: OwidDistinctLines + map: + time: 2019 + colorScale: + baseColorScheme: YlOrBr + binningStrategy: manual + customNumericValues: + - 5 + - 10 + - 15 + - 20 + - 25 + - 30 + - 35 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_50_median: + presentation: + title_public: Share of population below 50% of median income or consumption + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Relative poverty: Share of people below 50% of median income' + subtitle: Relative poverty is measured in terms of a poverty line that rises and falls over time with average incomes – in this case set at 50% of median income. + note: >- + Depending on the country and year, the data relates to income measured after + taxes and benefits, or to consumption, [per capita](#dod:per-capita). 
+ hasMapTab: true + tab: map + yAxis: + min: 0 + colorScale: + baseColorScheme: OwidDistinctLines + map: + time: 2019 + colorScale: + baseColorScheme: YlOrBr + binningStrategy: manual + customNumericValues: + - 3 + - 6 + - 9 + - 12 + - 15 + - 18 + - 21 + - 24 + - 27 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + headcount_ratio_40_median: + presentation: + title_public: Share of population below 40% of median income or consumption + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Relative poverty: Share of people below 40% of median income' + subtitle: Relative poverty is measured in terms of a poverty line that rises and falls over time with average incomes – in this case set at 40% of median income. + note: >- + Depending on the country and year, the data relates to income measured after + taxes and benefits, or to consumption, [per capita](#dod:per-capita). + hasMapTab: true + tab: map + yAxis: + min: 0 + colorScale: + baseColorScheme: OwidDistinctLines + map: + time: 2019 + colorScale: + baseColorScheme: YlOrBr + binningStrategy: manual + customNumericValues: + - 2 + - 4 + - 6 + - 8 + - 10 + - 12 + - 14 + - 16 + - 18 + - 20 + selectedEntityNames: + - Bangladesh + - Bolivia + - Madagascar + - India + - China + - Ethiopia + + gini: + presentation: + title_public: Gini Coefficient + topic_tags: + - Poverty + - Economic Inequality + faqs: + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: 'Income inequality: Gini coefficient' + subtitle: >- + The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. + Higher values indicate higher inequality. Depending on the country and year, + the data relates to income measured after taxes and benefits, or to + consumption, [per capita](#dod:per-capita). + note: >- + Income and consumption estimates are available separately in this [Data + Explorer](https://ourworldindata.org/explorers/pip-inequality-explorer). + hasMapTab: true + tab: map + variantName: World Bank + originUrl: https://ourworldindata.org/economic-inequality + yAxis: + min: 0 + map: + time: 2019 + colorScale: + baseColorScheme: Oranges + binningStrategy: manual + customNumericMinValue: 1 + customNumericValues: + - 0.3 + - 0.35 + - 0.4 + - 0.45 + - 0.5 + - 0.55 + - 0.6 + selectedEntityNames: + - Chile + - Brazil + - South Africa + - United States + - France + - China + + headcount_215_regions: + title: $2.15 a day - Number in poverty (Regional aggregates) + unit: "people" + short_unit: "" + description_short: Number of people in households with an income or consumption per person below $2.15 a day. + description_key: + - Extreme poverty here is defined as living below the International Poverty Line of $2.15 per day. + - The data is measured in international-$ at 2017 prices – this adjusts for inflation and for differences in the cost of living between countries. + - Depending on the country and year, the data relates to income measured after taxes and benefits, or to consumption, per capita. 'Per capita' means that the income of each household is attributed equally to each member of the household (including children). + - Non-market sources of income, including food grown by subsistence farmers for their own consumption, are taken into account. 
+ description_processing: |- + PIP provides regional aggregate figures for the number of people living below the International Poverty Line. Unfortunately, for certain regions and years the data survey coverage is too low and the results are suppressed. From 1990 onwards, it is only for South Asia and Sub-Saharan Africa (on different years) that regional estimates are sometimes missing. + + For these years we calculate the number of poor in the region as the difference between the estimated total number of poor across the world and the sum of the number of poor across all other regions. + + Prior to 1990 estimates for more than one region are missing, precluding this method. + display: + numDecimalPlaces: 0 + presentation: + title_public: Total of population living in extreme poverty by world region + topic_tags: + - Poverty + - Economic Growth + - Economic Inequality + faqs: + - fragment_id: poverty-international-poverty-line + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-international-dollars + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-comparability + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: poverty-regional-estimates + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + title: Total population living in extreme poverty by world region + subtitle: >- + Extreme poverty is defined as living below the International Poverty Line of + $2.15 per day. This data is adjusted for inflation and for differences in the + cost of living between countries. + note: >- + This data is expressed in [international-$](#dod:int_dollar_abbreviation) at + 2017 prices. Depending on the country and year, it relates to income measured + after taxes and benefits, or to consumption, [per capita](#dod:per-capita). + type: StackedArea + addCountryMode: disabled + hideRelativeToggle: false + originUrl: https://ourworldindata.org/poverty + baseColorScheme: OwidCategoricalC + invertColorScheme: true + yAxis: + min: 0 + selectedEntityNames: + - Other high income countries (PIP) + - Latin America and the Caribbean (PIP) + - East Asia and Pacific (PIP) + - South Asia (PIP) + - Middle East and North Africa (PIP) + - Europe and Central Asia (PIP) + - Sub-Saharan Africa (PIP) + + surveys_past_decade: + title: Number of surveys in the past decade + unit: "surveys" + short_unit: "" + description_short: The number of income or consumption surveys available in the past decade. Each decade comprises the current year and the nine years before. + description_processing: |- + For a small number of country-year observations, the World Bank PIP data contains two estimates: one based on income data and one based on consumption data. In these cases we keep only the consumption estimate in order to obtain a single series for each country. This means the indicator is estimating the number of years at least one survey was conducted in the past decade, rather than the number of surveys. + display: + numDecimalPlaces: 0 \ No newline at end of file diff --git a/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py new file mode 100644 index 00000000000..65564efb8b7 --- /dev/null +++ b/etl/steps/data/garden/wb/2024-03-27/world_bank_pip.py @@ -0,0 +1,1203 @@ +""" +Load a meadow dataset and create a garden dataset. + +When running this step in an update, be sure to check all the outputs and logs to ensure the data is correct. 
+Also check the manual fix of Polish data to avoid weird drop in the income/consumption levels from 2020 onwards. +(Remove metadata about this when the fix is no longer needed in inc_cons_dict["income_consumption"]["processing_description"], shared.py script) + +NOTE: To extract the log of the process (to review sanity checks, for example), run the following command in the terminal: + nohup poetry run etl run world_bank_pip > output.log 2>&1 & +""" + +from typing import Tuple + +import numpy as np +import owid.catalog.processing as pr +from owid.catalog import Table +from shared import add_metadata_vars, add_metadata_vars_percentiles +from structlog import get_logger +from tabulate import tabulate + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Initialize logger. +log = get_logger() + +# Define absolute poverty lines used depending on PPP version +# NOTE: Modify if poverty lines are updated from source +povlines_dict = { + 2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000], + 2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000], +} + +# Define regions in the dataset +regions_list = [ + "East Asia and Pacific (PIP)", + "Eastern and Southern Africa (PIP)", + "Europe and Central Asia (PIP)", + "Latin America and the Caribbean (PIP)", + "Middle East and North Africa (PIP)", + "Other high income countries (PIP)", + "South Asia (PIP)", + "Sub-Saharan Africa (PIP)", + "Western and Central Africa (PIP)", + "World", +] + +# Set table format when printing +TABLEFMT = "pretty" + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("world_bank_pip") + + # Read tables from meadow dataset. + # Key indicators + tb = ds_meadow["world_bank_pip"].reset_index() + + # Percentiles + tb_percentiles = ds_meadow["world_bank_pip_percentiles"].reset_index() + + # Process data + # Make table wide and change column names + tb = process_data(tb) + + # Calculate inequality measures + tb = calculate_inequality(tb) + + # Harmonize country names + tb: Table = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb_percentiles: Table = geo.harmonize_countries(df=tb_percentiles, countries_file=paths.country_mapping_path) + + # Show regional data from 1990 onwards + tb = regional_data_from_1990(tb, regions_list) + tb_percentiles = regional_data_from_1990(tb_percentiles, regions_list) + + # Amend the entity to reflect if data refers to urban or rural only + tb = identify_rural_urban(tb) + + # Separate out ppp and filled data from the main dataset + tb_2011, tb_2017 = separate_ppp_data(tb) + tb_percentiles_2011, tb_percentiles_2017 = separate_ppp_data(tb_percentiles) + + # Create stacked variables from headcount and headcount_ratio + tb_2011, col_stacked_n_2011, col_stacked_pct_2011 = create_stacked_variables( + tb_2011, povlines_dict, ppp_version=2011 + ) + tb_2017, col_stacked_n_2017, col_stacked_pct_2017 = create_stacked_variables( + tb_2017, povlines_dict, ppp_version=2017 + ) + + # Sanity checks. 
I don't run for percentile tables because that process was done in the extraction + tb_2011 = sanity_checks( + tb_2011, povlines_dict, ppp_version=2011, col_stacked_n=col_stacked_n_2011, col_stacked_pct=col_stacked_pct_2011 + ) + tb_2017 = sanity_checks( + tb_2017, povlines_dict, ppp_version=2017, col_stacked_n=col_stacked_n_2017, col_stacked_pct=col_stacked_pct_2017 + ) + + # Separate out consumption-only, income-only. Also, create a table with both income and consumption + tb_inc_2011, tb_cons_2011, tb_inc_or_cons_2011 = inc_or_cons_data(tb_2011) + tb_inc_2017, tb_cons_2017, tb_inc_or_cons_2017 = inc_or_cons_data(tb_2017) + + # Create regional headcount variable, by patching missing values with the difference between world and regional headcount + tb_inc_or_cons_2017 = regional_headcount(tb_inc_or_cons_2017) + + # Create survey count dataset, by counting the number of surveys available for each country in the past decade + tb_inc_or_cons_2017 = survey_count(tb_inc_or_cons_2017) + + # Add metadata by code + tb_inc_2011 = add_metadata_vars(tb_garden=tb_inc_2011, ppp_version=2011, welfare_type="income") + tb_cons_2011 = add_metadata_vars(tb_garden=tb_cons_2011, ppp_version=2011, welfare_type="consumption") + tb_inc_or_cons_2011 = add_metadata_vars( + tb_garden=tb_inc_or_cons_2011, + ppp_version=2011, + welfare_type="income_consumption", + ) + + tb_inc_2017 = add_metadata_vars(tb_garden=tb_inc_2017, ppp_version=2017, welfare_type="income") + tb_cons_2017 = add_metadata_vars(tb_garden=tb_cons_2017, ppp_version=2017, welfare_type="consumption") + tb_inc_or_cons_2017 = add_metadata_vars( + tb_garden=tb_inc_or_cons_2017, + ppp_version=2017, + welfare_type="income_consumption", + ) + + tb_percentiles_2011 = add_metadata_vars_percentiles( + tb_garden=tb_percentiles_2011, + ppp_version=2011, + welfare_type="income_consumption", + ) + tb_percentiles_2017 = add_metadata_vars_percentiles( + tb_garden=tb_percentiles_2017, + ppp_version=2017, + welfare_type="income_consumption", + ) + + # Set index and sort + # Define index cols + index_cols = ["country", "year"] + index_cols_percentiles = ["country", "year", "reporting_level", "welfare_type", "percentile"] + tb_inc_2011 = set_index_and_sort(tb=tb_inc_2011, index_cols=index_cols) + tb_cons_2011 = set_index_and_sort(tb=tb_cons_2011, index_cols=index_cols) + tb_inc_or_cons_2011 = set_index_and_sort(tb=tb_inc_or_cons_2011, index_cols=index_cols) + + tb_inc_2017 = set_index_and_sort(tb=tb_inc_2017, index_cols=index_cols) + tb_cons_2017 = set_index_and_sort(tb=tb_cons_2017, index_cols=index_cols) + tb_inc_or_cons_2017 = set_index_and_sort(tb=tb_inc_or_cons_2017, index_cols=index_cols) + + tb_percentiles_2011 = set_index_and_sort(tb=tb_percentiles_2011, index_cols=index_cols_percentiles) + tb_percentiles_2017 = set_index_and_sort(tb=tb_percentiles_2017, index_cols=index_cols_percentiles) + + # Create spell tables to separate different survey spells in the explorers + spell_tables_inc = create_survey_spells(tb=tb_inc_2017) + spell_tables_cons = create_survey_spells(tb=tb_cons_2017) + + # For income and consumption we combine the tables to not lose information from tb_inc_or_cons_2017 + spell_tables_inc_or_cons = create_survey_spells_inc_cons(tb_inc=tb_inc_2017, tb_cons=tb_cons_2017) + + # Drop columns not needed + tb_inc_2011 = drop_columns(tb_inc_2011) + tb_cons_2011 = drop_columns(tb_cons_2011) + tb_inc_or_cons_2011 = drop_columns(tb_inc_or_cons_2011) + + tb_inc_2017 = drop_columns(tb_inc_2017) + tb_cons_2017 = drop_columns(tb_cons_2017) + 
tb_inc_or_cons_2017 = drop_columns(tb_inc_or_cons_2017) + + # Merge tables for PPP comparison explorer + tb_inc_2011_2017 = combine_tables_2011_2017(tb_2011=tb_inc_2011, tb_2017=tb_inc_2017, short_name="income_2011_2017") + tb_cons_2011_2017 = combine_tables_2011_2017( + tb_2011=tb_cons_2011, tb_2017=tb_cons_2017, short_name="consumption_2011_2017" + ) + tb_inc_or_cons_2011_2017 = combine_tables_2011_2017( + tb_2011=tb_inc_or_cons_2011, tb_2017=tb_inc_or_cons_2017, short_name="income_consumption_2011_2017" + ) + + # Define tables to upload + # The ones we need in Grapher admin would be tb_inc_or_cons_2011, tb_inc_or_cons_2017 + tables = ( + [ + tb_inc_2011, + tb_cons_2011, + tb_inc_or_cons_2011, + tb_inc_2017, + tb_cons_2017, + tb_inc_or_cons_2017, + tb_inc_2011_2017, + tb_cons_2011_2017, + tb_inc_or_cons_2011_2017, + tb_percentiles_2011, + tb_percentiles_2017, + ] + + spell_tables_inc + + spell_tables_cons + + spell_tables_inc_or_cons + ) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def process_data(tb: Table) -> Table: + # rename columns + tb = tb.rename(columns={"headcount": "headcount_ratio", "poverty_gap": "poverty_gap_index"}) + + # Changing the decile(i) variables for decile(i)_share + for i in range(1, 11): + tb = tb.rename(columns={f"decile{i}": f"decile{i}_share"}) + + # Calculate number in poverty + tb["headcount"] = tb["headcount_ratio"] * tb["reporting_pop"] + tb["headcount"] = tb["headcount"].round(0) + + # Calculate shortfall of incomes + tb["total_shortfall"] = tb["poverty_gap_index"] * tb["poverty_line"] * tb["reporting_pop"] + + # Calculate average shortfall of incomes (averaged across population in poverty) + tb["avg_shortfall"] = tb["total_shortfall"] / tb["headcount"] + + # Calculate income gap ratio (according to Ravallion's definition) + tb["income_gap_ratio"] = (tb["total_shortfall"] / tb["headcount"]) / tb["poverty_line"] + + # Same for relative poverty + for pct in [40, 50, 60]: + tb[f"headcount_{pct}_median"] = tb[f"headcount_ratio_{pct}_median"] * tb["reporting_pop"] + tb[f"headcount_{pct}_median"] = tb[f"headcount_{pct}_median"].round(0) + tb[f"total_shortfall_{pct}_median"] = ( + tb[f"poverty_gap_index_{pct}_median"] * tb["median"] * pct / 100 * tb["reporting_pop"] + ) + tb[f"avg_shortfall_{pct}_median"] = tb[f"total_shortfall_{pct}_median"] / tb[f"headcount_{pct}_median"] + tb[f"income_gap_ratio_{pct}_median"] = (tb[f"total_shortfall_{pct}_median"] / tb[f"headcount_{pct}_median"]) / ( + tb["median"] * pct / 100 + ) + + # Shares to percentages + # executing the function over list of vars + pct_indicators = [ + "headcount_ratio", + "income_gap_ratio", + "poverty_gap_index", + "headcount_ratio_40_median", + "headcount_ratio_50_median", + "headcount_ratio_60_median", + "income_gap_ratio_40_median", + "income_gap_ratio_50_median", + "income_gap_ratio_60_median", + "poverty_gap_index_40_median", + "poverty_gap_index_50_median", + "poverty_gap_index_60_median", + ] + tb.loc[:, pct_indicators] = tb[pct_indicators] * 100 + + # Create a new column for the poverty line in cents and string + tb["poverty_line_cents"] = round(tb["poverty_line"] * 100).astype(int).astype(str) + + # Make the table wide, with poverty_line_cents as columns + tb = tb.pivot( + index=[ + "ppp_version", + "country", + "year", + 
"reporting_level", + "welfare_type", + "survey_comparability", + "comparable_spell", + "reporting_pop", + "mean", + "median", + "mld", + "gini", + "polarization", + "decile1_share", + "decile2_share", + "decile3_share", + "decile4_share", + "decile5_share", + "decile6_share", + "decile7_share", + "decile8_share", + "decile9_share", + "decile10_share", + "decile1_thr", + "decile2_thr", + "decile3_thr", + "decile4_thr", + "decile5_thr", + "decile6_thr", + "decile7_thr", + "decile8_thr", + "decile9_thr", + "is_interpolated", + "distribution_type", + "estimation_type", + "headcount_40_median", + "headcount_50_median", + "headcount_60_median", + "headcount_ratio_40_median", + "headcount_ratio_50_median", + "headcount_ratio_60_median", + "income_gap_ratio_40_median", + "income_gap_ratio_50_median", + "income_gap_ratio_60_median", + "poverty_gap_index_40_median", + "poverty_gap_index_50_median", + "poverty_gap_index_60_median", + "avg_shortfall_40_median", + "avg_shortfall_50_median", + "avg_shortfall_60_median", + "total_shortfall_40_median", + "total_shortfall_50_median", + "total_shortfall_60_median", + "poverty_severity_40_median", + "poverty_severity_50_median", + "poverty_severity_60_median", + "watts_40_median", + "watts_50_median", + "watts_60_median", + ], + columns="poverty_line_cents", + values=[ + "headcount", + "headcount_ratio", + "income_gap_ratio", + "poverty_gap_index", + "avg_shortfall", + "total_shortfall", + "poverty_severity", + "watts", + ], + ) + + # Flatten column names + tb.columns = ["_".join(col).strip() for col in tb.columns.values] + + # Reset index + tb = tb.reset_index() + + return tb + + +def create_stacked_variables(tb: Table, povlines_dict: dict, ppp_version: int) -> Tuple[Table, list, list]: + """ + Create stacked variables from the indicators to plot them as stacked area/bar charts + """ + # Select poverty lines between 2011 and 2017 and sort in case they are not in order + povlines = povlines_dict[ppp_version] + povlines.sort() + + # Above variables + + col_above_n = [] + col_above_pct = [] + + for p in povlines: + varname_n = f"headcount_above_{p}" + varname_pct = f"headcount_ratio_above_{p}" + + tb[varname_n] = tb["reporting_pop"] - tb[f"headcount_{p}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + + col_above_n.append(varname_n) + col_above_pct.append(varname_pct) + + tb.loc[:, col_above_pct] = tb[col_above_pct] * 100 + + # Stacked variables + + col_stacked_n = [] + col_stacked_pct = [] + + for i in range(len(povlines)): + # if it's the first value only continue + if i == 0: + continue + + # If it's the last value calculate the people between this value and the previous + # and also the people over this poverty line (and percentages) + elif i == len(povlines) - 1: + varname_n = f"headcount_between_{povlines[i-1]}_{povlines[i]}" + varname_pct = f"headcount_ratio_between_{povlines[i-1]}_{povlines[i]}" + tb[varname_n] = tb[f"headcount_{povlines[i]}"] - tb[f"headcount_{povlines[i-1]}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + col_stacked_n.append(varname_n) + col_stacked_pct.append(varname_pct) + varname_n = f"headcount_above_{povlines[i]}" + varname_pct = f"headcount_ratio_above_{povlines[i]}" + tb[varname_n] = tb["reporting_pop"] - tb[f"headcount_{povlines[i]}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + col_stacked_n.append(varname_n) + col_stacked_pct.append(varname_pct) + + # If it's any value between the first and the last calculate the people between this value and the previous (and percentage) + else: + 
varname_n = f"headcount_between_{povlines[i-1]}_{povlines[i]}" + varname_pct = f"headcount_ratio_between_{povlines[i-1]}_{povlines[i]}" + tb[varname_n] = tb[f"headcount_{povlines[i]}"] - tb[f"headcount_{povlines[i-1]}"] + tb[varname_pct] = tb[varname_n] / tb["reporting_pop"] + col_stacked_n.append(varname_n) + col_stacked_pct.append(varname_pct) + + tb.loc[:, col_stacked_pct] = tb[col_stacked_pct] * 100 + + # Add variables below first poverty line to the stacked variables + col_stacked_n.append(f"headcount_{povlines[0]}") + col_stacked_pct.append(f"headcount_ratio_{povlines[0]}") + + # Calculate stacked variables which "jump" the original order + + tb[f"headcount_between_{povlines[1]}_{povlines[4]}"] = ( + tb[f"headcount_{povlines[4]}"] - tb[f"headcount_{povlines[1]}"] + ) + tb[f"headcount_between_{povlines[4]}_{povlines[6]}"] = ( + tb[f"headcount_{povlines[6]}"] - tb[f"headcount_{povlines[4]}"] + ) + + tb[f"headcount_ratio_between_{povlines[1]}_{povlines[4]}"] = ( + tb[f"headcount_ratio_{povlines[4]}"] - tb[f"headcount_ratio_{povlines[1]}"] + ) + tb[f"headcount_ratio_between_{povlines[4]}_{povlines[6]}"] = ( + tb[f"headcount_ratio_{povlines[6]}"] - tb[f"headcount_ratio_{povlines[4]}"] + ) + + return tb, col_stacked_n, col_stacked_pct + + +def calculate_inequality(tb: Table) -> Table: + """ + Calculate inequality measures: decile averages and ratios + """ + + col_decile_share = [] + col_decile_avg = [] + col_decile_thr = [] + + for i in range(1, 11): + if i != 10: + varname_thr = f"decile{i}_thr" + col_decile_thr.append(varname_thr) + + varname_share = f"decile{i}_share" + varname_avg = f"decile{i}_avg" + tb[varname_avg] = tb[varname_share] * tb["mean"] / 0.1 + + col_decile_share.append(varname_share) + col_decile_avg.append(varname_avg) + + # Multiplies decile columns by 100 + tb.loc[:, col_decile_share] = tb[col_decile_share] * 100 + + # Create bottom 50 and middle 40% shares + tb["bottom50_share"] = ( + tb["decile1_share"] + tb["decile2_share"] + tb["decile3_share"] + tb["decile4_share"] + tb["decile5_share"] + ) + tb["middle40_share"] = tb["decile6_share"] + tb["decile7_share"] + tb["decile8_share"] + tb["decile9_share"] + + # Palma ratio and other average/share ratios + tb["palma_ratio"] = tb["decile10_share"] / ( + tb["decile1_share"] + tb["decile2_share"] + tb["decile3_share"] + tb["decile4_share"] + ) + tb["s80_s20_ratio"] = (tb["decile9_share"] + tb["decile10_share"]) / (tb["decile1_share"] + tb["decile2_share"]) + tb["p90_p10_ratio"] = tb["decile9_thr"] / tb["decile1_thr"] + tb["p90_p50_ratio"] = tb["decile9_thr"] / tb["decile5_thr"] + tb["p50_p10_ratio"] = tb["decile5_thr"] / tb["decile1_thr"] + + # Replace infinite values with nulls + tb = tb.replace([np.inf, -np.inf], np.nan) + return tb + + +def identify_rural_urban(tb: Table) -> Table: + """ + Amend the entity to reflect if data refers to urban or rural only + """ + + # Make country and reporting_level columns into strings + tb["country"] = tb["country"].astype(str) + tb["reporting_level"] = tb["reporting_level"].astype(str) + ix = tb["reporting_level"].isin(["urban", "rural"]) + tb.loc[(ix), "country"] = tb.loc[(ix), "country"] + " (" + tb.loc[(ix), "reporting_level"] + ")" + + return tb + + +def sanity_checks( + tb: Table, povlines_dict: dict, ppp_version: int, col_stacked_n: list, col_stacked_pct: list +) -> Table: + """ + Sanity checks for the table + """ + + # Select poverty lines between 2011 and 2017 and sort in case they are not in order + povlines = povlines_dict[ppp_version] + povlines.sort() + + # Save the number 
of observations before the checks
+    obs_before_checks = len(tb)
+
+    # Create lists of variables to check
+    col_headcount = []
+    col_headcount_ratio = []
+    col_povertygap = []
+    col_tot_shortfall = []
+    col_watts = []
+    col_poverty_severity = []
+    col_decile_share = []
+    col_decile_thr = []
+
+    for p in povlines:
+        col_headcount.append(f"headcount_{p}")
+        col_headcount_ratio.append(f"headcount_ratio_{p}")
+        col_povertygap.append(f"poverty_gap_index_{p}")
+        col_tot_shortfall.append(f"total_shortfall_{p}")
+        col_watts.append(f"watts_{p}")
+        col_poverty_severity.append(f"poverty_severity_{p}")
+
+    for i in range(1, 11):
+        col_decile_share.append(f"decile{i}_share")
+        if i != 10:
+            col_decile_thr.append(f"decile{i}_thr")
+
+    ############################
+    # Negative values
+    mask = (
+        tb[
+            col_headcount
+            + col_headcount_ratio
+            + col_povertygap
+            + col_tot_shortfall
+            + col_watts
+            + col_poverty_severity
+            + col_decile_share
+            + col_decile_thr
+            + ["mean", "median", "mld", "gini", "polarization"]
+        ]
+        .lt(0)
+        .any(axis=1)
+    )
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.fatal(
+            f"""There are {len(tb_error)} observations with negative values! In
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type']], headers = 'keys', tablefmt = TABLEFMT)}"""
+        )
+        # NOTE: Check if we want to delete these observations
+        # tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Stacked values not adding up to 100%
+    tb["sum_pct"] = tb[col_stacked_pct].sum(axis=1)
+    mask = (tb["sum_pct"] >= 100.1) | (tb["sum_pct"] <= 99.9)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""{len(tb_error)} observations of stacked values are not adding up to 100% and will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type', 'sum_pct']], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}"""
+        )
+        tb = tb[~mask].reset_index(drop=True).copy()
+
+    ############################
+    # Missing poverty values (headcount, poverty gap, total shortfall)
+    cols_to_check = (
+        col_headcount + col_headcount_ratio + col_povertygap + col_tot_shortfall + col_stacked_n + col_stacked_pct
+    )
+    mask = tb[cols_to_check].isna().any(axis=1)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with missing poverty values, which will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type'] + col_headcount], headers = 'keys', tablefmt = TABLEFMT)}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Missing median
+    mask = tb["median"].isna()
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing median. They will not be deleted.""")
+
+    ############################
+    # Missing mean
+    mask = tb["mean"].isna()
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing mean. They will not be deleted.""")
+
+    ############################
+    # Missing gini
+    mask = tb["gini"].isna()
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing gini. They will not be deleted.""")
+
+    ############################
+    # Missing decile shares
+    mask = tb[col_decile_share].isna().any(axis=1)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(f"""There are {len(tb_error)} observations with missing decile shares. They will not be deleted.""")
+
+    ############################
+    # Missing decile thresholds
+    mask = tb[col_decile_thr].isna().any(axis=1)
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.info(
+            f"""There are {len(tb_error)} observations with missing decile thresholds. They will not be deleted."""
+        )
+
+    ############################
+    # Headcount monotonicity check
+    m_check_vars = []
+    for i in range(len(col_headcount)):
+        if i > 0:
+            check_varname = f"m_check_{i}"
+            tb[check_varname] = tb[f"{col_headcount[i]}"] >= tb[f"{col_headcount[i-1]}"]
+            m_check_vars.append(check_varname)
+    tb["check_total"] = tb[m_check_vars].all(axis=1)
+
+    tb_error = tb[~tb["check_total"]].reset_index(drop=True)
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with headcount not monotonically increasing, which will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type'] + col_headcount], headers = 'keys', tablefmt = TABLEFMT, floatfmt="0.0f")}"""
+        )
+        tb = tb[tb["check_total"]].reset_index(drop=True)
+
+    ############################
+    # Threshold monotonicity check
+    m_check_vars = []
+    for i in range(1, 10):
+        if i > 1:
+            check_varname = f"m_check_{i}"
+            tb[check_varname] = tb[f"decile{i}_thr"] >= tb[f"decile{i-1}_thr"]
+            m_check_vars.append(check_varname)
+
+    tb["check_total"] = tb[m_check_vars].all(axis=1)
+
+    # Drop rows if columns in col_decile_thr are all null. Keep if some are null
+    mask = (~tb["check_total"]) & (tb[col_decile_thr].notnull().any(axis=1))
+
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with thresholds not monotonically increasing, which will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type']], headers = 'keys', tablefmt = TABLEFMT)}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Shares monotonicity check
+    m_check_vars = []
+    for i in range(1, 11):
+        if i > 1:
+            check_varname = f"m_check_{i}"
+            tb[check_varname] = tb[f"decile{i}_share"] >= tb[f"decile{i-1}_share"]
+            m_check_vars.append(check_varname)
+
+    tb["check_total"] = tb[m_check_vars].all(axis=1)
+
+    # Drop rows if columns in col_decile_share are all null. Keep if some are null
+    mask = (~tb["check_total"]) & (tb[col_decile_share].notnull().any(axis=1))
+    tb_error = tb[mask].reset_index(drop=True).copy()
+
+    if not tb_error.empty:
+        log.warning(
+            f"""There are {len(tb_error)} observations with shares not monotonically increasing, which will be deleted:
+            {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type'] + col_decile_share], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}"""
+        )
+        tb = tb[~mask].reset_index(drop=True)
+
+    ############################
+    # Shares not adding up to 100%
+
+    tb["sum_pct"] = tb[col_decile_share].sum(axis=1)
+
+    # Drop rows if columns in col_decile_share are all null.
Keep if some are null + mask = (tb["sum_pct"] >= 100.1) | (tb["sum_pct"] <= 99.9) & (tb[col_decile_share].notnull().any(axis=1)) + tb_error = tb[mask].reset_index(drop=True).copy() + + if not tb_error.empty: + log.warning( + f"""{len(tb_error)} observations of shares are not adding up to 100% and will be deleted: + {tabulate(tb_error[['country', 'year', 'reporting_level', 'welfare_type', 'sum_pct']], headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}""" + ) + tb = tb[~mask].reset_index(drop=True) + + ############################ + # delete columns created for the checks + tb = tb.drop(columns=m_check_vars + ["m_check_1", "check_total", "sum_pct"]) + + obs_after_checks = len(tb) + log.info(f"Sanity checks deleted {obs_before_checks - obs_after_checks} observations for {ppp_version} PPPs.") + + return tb + + +def separate_ppp_data(tb: Table) -> Tuple[Table, Table]: + """ + Separate out ppp data from the main dataset + """ + + # Filter table to include only the right ppp_version + # Also, drop columns with all NaNs (which are the ones that are not relevant for the ppp_version) + tb_2011 = tb[tb["ppp_version"] == 2011].dropna(axis=1, how="all").reset_index(drop=True).copy() + tb_2017 = tb[tb["ppp_version"] == 2017].dropna(axis=1, how="all").reset_index(drop=True).copy() + + return tb_2011, tb_2017 + + +def inc_or_cons_data(tb: Table) -> Tuple[Table, Table, Table]: + """ + Separate income and consumption data + """ + + # Separate out consumption-only, income-only. Also, create a table with both income and consumption + tb_inc = tb[tb["welfare_type"] == "income"].reset_index(drop=True).copy() + tb_cons = tb[tb["welfare_type"] == "consumption"].reset_index(drop=True).copy() + tb_inc_or_cons = tb.copy() + + # If both inc and cons are available in a given year, drop inc + + # Flag duplicates – indicating multiple welfare_types + # Sort values to ensure the welfare_type consumption is marked as False when there are multiple welfare types + tb_inc_or_cons = tb_inc_or_cons.sort_values( + by=["ppp_version", "country", "year", "reporting_level", "welfare_type"], ignore_index=True + ) + tb_inc_or_cons["duplicate_flag"] = tb_inc_or_cons.duplicated( + subset=["ppp_version", "country", "year", "reporting_level"] + ) + + # Drop income where income and consumption are available + tb_inc_or_cons = tb_inc_or_cons[ + (~tb_inc_or_cons["duplicate_flag"]) | (tb_inc_or_cons["welfare_type"] == "consumption") + ] + tb_inc_or_cons.drop(columns=["duplicate_flag"], inplace=True) + + tb_inc_or_cons = check_jumps_in_grapher_dataset(tb_inc_or_cons) + + tb_inc_or_cons = remove_confusing_datapoints_in_grapher_dataset(tb_inc_or_cons) + + return tb_inc, tb_cons, tb_inc_or_cons + + +def regional_headcount(tb: Table) -> Table: + """ + Create regional headcount dataset, by patching missing values with the difference between world and regional headcount + """ + + # Keep only regional data: for regions, these are the reporting_level rows not in ['national', 'urban', 'rural'] + tb_regions = tb[~tb["reporting_level"].isin(["national", "urban", "rural"])].reset_index(drop=True).copy() + + # Remove Western and Central and Eastern and Southern Africa. 
It's redundant with Sub-Saharan Africa (PIP) + tb_regions = tb_regions[ + ~tb_regions["country"].isin(["Western and Central Africa (PIP)", "Eastern and Southern Africa (PIP)"]) + ].reset_index(drop=True) + + # Select needed columns and pivot + tb_regions = tb_regions[["country", "year", "headcount_215"]] + tb_regions = tb_regions.pivot(index="year", columns="country", values="headcount_215") + + # Drop rows with more than one region with null headcount + tb_regions["check_total"] = tb_regions[tb_regions.columns].isnull().sum(axis=1) + mask = tb_regions["check_total"] > 1 + + tb_out = tb_regions[mask].reset_index() + if len(tb_out) > 0: + log.info( + f"""There are {len(tb_out)} years with more than one null region value and will be deleted: + {list(tb_out.year.unique())}""" + ) + tb_regions = tb_regions[~mask].reset_index() + tb_regions = tb_regions.drop(columns="check_total") + + # Get difference between world and (total) regional headcount, to patch rows with one missing value + cols_to_sum = [e for e in list(tb_regions.columns) if e not in ["year", "World"]] + tb_regions["sum_regions"] = tb_regions[cols_to_sum].sum(axis=1) + + tb_regions["diff_world_regions"] = tb_regions["World"] - tb_regions["sum_regions"] + + # Fill null values with the difference and drop aux variables + col_dictionary = dict.fromkeys(cols_to_sum, tb_regions["diff_world_regions"]) + tb_regions.loc[:, cols_to_sum] = tb_regions[cols_to_sum].fillna(col_dictionary) + tb_regions = tb_regions.drop(columns=["World", "sum_regions", "diff_world_regions"]) + + # NOTE: I am not extracting data for China and India at least for now, because we are only extracting non filled data + # The data originally came from filled data to plot properly. + + # # Get headcount values for China and India + # df_chn_ind = tb[(tb["country"].isin(["China", "India"])) & (tb["reporting_level"] == "national")].reset_index( + # drop=True + # ) + # df_chn_ind = df_chn_ind[["country", "year", "headcount_215"]] + + # # Make table wide and merge with regional data + # df_chn_ind = df_chn_ind.pivot(index="year", columns="country", values="headcount_215").reset_index() + # tb_regions = pr.merge(tb_regions, df_chn_ind, on="year", how="left") + + # tb_regions["East Asia and Pacific excluding China"] = ( + # tb_regions["East Asia and Pacific (PIP)"] - tb_regions["China"] + # ) + # tb_regions["South Asia excluding India"] = tb_regions["South Asia (PIP)"] - tb_regions["India"] + + tb_regions = pr.melt(tb_regions, id_vars=["year"], var_name="country", value_name="headcount_215") + tb_regions = tb_regions[["country", "year", "headcount_215"]] + + # Rename headcount_215 to headcount_215_region, to distinguish it from the original headcount_215 when merging + tb_regions = tb_regions.rename(columns={"headcount_215": "headcount_215_regions"}) + + # Merge with original table + tb = pr.merge(tb, tb_regions, on=["country", "year"], how="outer") + + return tb + + +def survey_count(tb: Table) -> Table: + """ + Create survey count indicator, by counting the number of surveys available for each country in the past decade + """ + # Remove regions from the table + tb_survey = tb[~tb["reporting_level"].isnull()].reset_index(drop=True).copy() + + min_year = int(tb_survey["year"].min()) + max_year = int(tb_survey["year"].max()) + year_list = list(range(min_year, max_year + 1)) + country_list = list(tb_survey["country"].unique()) + + # Create two tables with all the years and entities + year_tb_survey = Table(year_list) + entity_tb_survey = Table(country_list) + + # Make a 
cartesian product of both dataframes: join all the combinations between all the entities and all the years + cross = pr.merge(entity_tb_survey, year_tb_survey, how="cross") + cross = cross.rename(columns={"0_x": "country", "0_y": "year"}) + + # Merge cross and df_country, to include all the possible rows in the dataset + tb_survey = pr.merge(cross, tb_survey[["country", "year"]], on=["country", "year"], how="left", indicator=True) + + # Mark with 1 if there are surveys available, 0 if not (this is done by checking if the row is in both datasets) + tb_survey["survey_available"] = 0 + tb_survey.loc[tb_survey["_merge"] == "both", "survey_available"] = 1 + + # Sum for each entity the surveys available for the previous 9 years and the current year + tb_survey["surveys_past_decade"] = ( + tb_survey["survey_available"] + .groupby(tb_survey["country"], sort=False) + .rolling(min_periods=1, window=10) + .sum() + .values + ) + + # Copy metadata + tb_survey["surveys_past_decade"] = tb_survey["surveys_past_decade"].copy_metadata(tb["reporting_level"]) + + # Keep columns needed + tb_survey = tb_survey[["country", "year", "surveys_past_decade"]] + + # Merge with original table + tb = pr.merge(tb_survey, tb, on=["country", "year"], how="left") + + return tb + + +def set_index_and_sort(tb: Table, index_cols: list) -> Table: + """ + Add index and sort + """ + + tb = tb.set_index(index_cols, verify_integrity=True).sort_index() + + return tb + + +def drop_columns(tb: Table) -> Table: + """ + Drop columns not needed + """ + + # Remove columns + tb = tb.drop( + columns=[ + "ppp_version", + "reporting_pop", + "is_interpolated", + "distribution_type", + "estimation_type", + "survey_comparability", + "comparable_spell", + ] + ) + + return tb + + +def create_survey_spells(tb: Table) -> list: + """ + Create tables for each indicator and survey spells, to be able to graph them in explorers. + """ + + tb = tb.copy() + + # drop rows where survey coverage = nan (This is just regions) + tb = tb[tb["survey_comparability"].notna()].reset_index() + + # Add 1 to make comparability var run from 1, not from 0 + tb["survey_comparability"] += 1 + + # Note the welfare type in the comparability spell + tb["survey_comparability"] = ( + tb["welfare_type"].astype(str) + "_spell_" + tb["survey_comparability"].astype(int).astype(str) + ) + + # Remove columns not needed: stacked, above, etc + drop_list = ["above", "between", "poverty_severity", "watts"] + for var in drop_list: + tb = tb[tb.columns.drop(list(tb.filter(like=var)))] + + vars = [ + i + for i in tb.columns + if i + not in [ + "country", + "year", + "ppp_version", + "reporting_level", + "welfare_type", + "reporting_pop", + "is_interpolated", + "distribution_type", + "estimation_type", + "survey_comparability", + "comparable_spell", + "headcount_215_regions", + "surveys_past_decade", + ] + ] + + # Define spell table list + spell_tables = [] + + # Loop over the variables in the main dataset + for select_var in vars: + tb_var = tb[["country", "year", select_var, "survey_comparability"]].copy() + + # convert to wide + tb_var = pr.pivot( + tb_var, + index=["country", "year"], + columns=["survey_comparability"], + values=select_var, + ) + + tb_var.metadata.short_name = f"{tb_var.metadata.short_name}_{select_var}" + + spell_tables.append(tb_var) + + return spell_tables + + +def create_survey_spells_inc_cons(tb_inc: Table, tb_cons: Table) -> list: + """ + Create table for each indicator and survey spells, to be able to graph them in explorers. 
+ This version recombines income and consumption tables to not lose dropped rows. + """ + + tb_inc = tb_inc.reset_index().copy() + tb_cons = tb_cons.reset_index().copy() + + # Concatenate the two tables + tb_inc_or_cons_2017_spells = pr.concat([tb_inc, tb_cons], ignore_index=True, short_name="income_consumption_2017") + + # Set index and sort + tb_inc_or_cons_2017_spells = set_index_and_sort( + tb=tb_inc_or_cons_2017_spells, index_cols=["country", "year", "reporting_level", "welfare_type"] + ) + + # Create spells + spell_tables = create_survey_spells(tb_inc_or_cons_2017_spells) + + return spell_tables + + +def combine_tables_2011_2017(tb_2011: Table, tb_2017: Table, short_name: str) -> Table: + """ + Combine income and consumption tables from 2011 and 2017 PPPs. + We will use this table for the Poverty Data Explorer: World Bank data - 2011 vs. 2017 prices. + """ + + # Identify columns to use (ID + indicators) + id_cols = ["country", "year"] + + tb_2011 = define_columns_for_ppp_comparison(tb=tb_2011, id_cols=id_cols, ppp_version=2011) + tb_2017 = define_columns_for_ppp_comparison(tb=tb_2017, id_cols=id_cols, ppp_version=2017) + + # Rename all the non-id columns with the suffix _ppp(year) + # (the suffix option in merge only adds suffix when columns coincide) + tb_2011 = tb_2011.rename(columns={c: c + "_ppp2011" for c in tb_2011.columns if c not in id_cols}) + tb_2017 = tb_2017.rename(columns={c: c + "_ppp2017" for c in tb_2017.columns if c not in id_cols}) + + # Merge the two files (it's OK to have an inneer join, because we want to keep country-year pairs that are in both files) + tb_2011_2017 = pr.merge(tb_2011, tb_2017, on=id_cols, validate="one_to_one", short_name=short_name) + + # Add index and sort + tb_2011_2017 = tb_2011_2017.set_index(["country", "year"], verify_integrity=True).sort_index() + + return tb_2011_2017 + + +def define_columns_for_ppp_comparison(tb: Table, id_cols: list, ppp_version: int) -> Table: + """ + Define columns to use for the comparison of 2011 and 2017 PPPs + """ + + tb = tb.reset_index() + # Define poverty lines + povlines_list = povlines_dict[ppp_version] + + # Define groups of columns + headcount_absolute_cols = [f"headcount_{p}" for p in povlines_list] + headcount_ratio_absolute_cols = [f"headcount_ratio_{p}" for p in povlines_list] + + headcount_relative_cols = [f"headcount_{rel}_median" for rel in [40, 50, 60]] + headcount_ratio_relative_cols = [f"headcount_ratio_{rel}_median" for rel in [40, 50, 60]] + + # Define all the columns to filter + + cols_list = ( + id_cols + + headcount_absolute_cols + + headcount_ratio_absolute_cols + + headcount_relative_cols + + headcount_ratio_relative_cols + + ["mean", "median", "decile1_thr", "decile9_thr"] + ) + + # Filter columns + tb = tb[cols_list] + + return tb + + +def regional_data_from_1990(tb: Table, regions_list: list) -> Table: + """ + Select regional data only from 1990 onwards, due to the uncertainty in 1980s data + """ + # Create a regions table + tb_regions = tb[(tb["year"] >= 1990) & (tb["country"].isin(regions_list))].reset_index(drop=True).copy() + + # Remove regions from tb + tb = tb[~tb["country"].isin(regions_list)].reset_index(drop=True).copy() + + # Concatenate both tables + tb = pr.concat([tb, tb_regions], ignore_index=True) + return tb + + +def check_jumps_in_grapher_dataset(tb: Table) -> Table: + """ + Check for jumps in the dataset, which can be caused by combining income and consumption estimates for one country series. 
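+    For example, a headcount ratio that moves by more than 20 percentage points between two
+    observations at most five years apart, while the welfare type switches between consumption
+    and income, is flagged and logged below; the flagged rows are currently not dropped.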
+ """ + # For each country, year, welfare_type and reporting_level, check if the difference between the columns is too high + + # Define columns to check: all the headcount ratio columns + cols_to_check = [ + col for col in tb.columns if "headcount_ratio" in col and "above" not in col and "between" not in col + ] + + for col in cols_to_check: + # Create a new column, shift_col, that is the same as col but shifted one row down for each country, year, welfare_type and reporting_level + tb["shift_col"] = tb.groupby(["country", "reporting_level"])[col].shift(1) + + # Create shift_year column + tb["shift_year"] = tb.groupby(["country", "reporting_level"])["year"].shift(1) + + # Create shift_welfare_type column + tb["shift_welfare_type"] = tb.groupby(["country", "reporting_level"])["welfare_type"].shift(1) + + # Calculate the difference between col and shift_col + tb["check_diff_column"] = tb[col] - tb["shift_col"] + + # Calculate the difference between years + tb["check_diff_year"] = tb["year"] - tb["shift_year"] + + # Calculate if the welfare type is the same + tb["check_diff_welfare_type"] = tb["welfare_type"] == tb["shift_welfare_type"] + + # Check if the difference is too high + mask = (abs(tb["check_diff_column"]) > 20) & (tb["check_diff_year"] <= 5) & ~tb["check_diff_welfare_type"] + tb_error = tb[mask].reset_index(drop=True).copy() + + if not tb_error.empty: + log.warning( + f"""There are {len(tb_error)} observations with abnormal jumps for {col}: + {tabulate(tb_error[['ppp_version', 'country', 'year', 'reporting_level', col, 'check_diff_column', 'check_diff_year']].sort_values('year').reset_index(drop=True), headers = 'keys', tablefmt = TABLEFMT, floatfmt=".1f")}""" + ) + # tb = tb[~mask].reset_index(drop=True) + + # Drop the columns created for the check + tb = tb.drop( + columns=[ + "shift_col", + "shift_year", + "shift_welfare_type", + "check_diff_column", + "check_diff_year", + "check_diff_welfare_type", + ] + ) + + return tb + + +def remove_confusing_datapoints_in_grapher_dataset(tb: Table) -> Table: + """ + Remove datapoints that are confusing when we are showing a unique series for both income and consumption. + """ + + # Define columns to keep data. 
Inequality is mostly not affected by the income/consumption choice
+    cols_to_keep = [
+        "country",
+        "year",
+        "reporting_level",
+        "welfare_type",
+        "gini",
+        "mld",
+        "decile1_share",
+        "decile2_share",
+        "decile3_share",
+        "decile4_share",
+        "decile5_share",
+        "decile6_share",
+        "decile7_share",
+        "decile8_share",
+        "decile9_share",
+        "decile10_share",
+        "bottom50_share",
+        "middle40_share",
+        "headcount_40_median",
+        "headcount_50_median",
+        "headcount_60_median",
+        "headcount_ratio_40_median",
+        "headcount_ratio_50_median",
+        "headcount_ratio_60_median",
+        "income_gap_ratio_40_median",
+        "income_gap_ratio_50_median",
+        "income_gap_ratio_60_median",
+        "poverty_gap_index_40_median",
+        "poverty_gap_index_50_median",
+        "poverty_gap_index_60_median",
+        "avg_shortfall_40_median",
+        "avg_shortfall_50_median",
+        "avg_shortfall_60_median",
+        "total_shortfall_40_median",
+        "total_shortfall_50_median",
+        "total_shortfall_60_median",
+        "poverty_severity_40_median",
+        "poverty_severity_50_median",
+        "poverty_severity_60_median",
+        "watts_40_median",
+        "watts_50_median",
+        "watts_60_median",
+        "palma_ratio",
+        "s80_s20_ratio",
+        "p90_p10_ratio",
+        "p90_p50_ratio",
+        "p50_p10_ratio",
+    ]
+
+    # Set the data for Poland from 2020 onwards to NaN, except for columns in cols_to_keep
+    mask = (tb["country"] == "Poland") & (tb["year"] >= 2020)
+    tb.loc[mask, tb.columns.difference(cols_to_keep)] = np.nan
+
+    return tb
diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml
new file mode 100644
index 00000000000..c25d010b2fc
--- /dev/null
+++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.meta.yml
@@ -0,0 +1,4 @@
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/
+dataset:
+  title: World Bank Poverty and Inequality Platform (PIP) (2011 prices)
diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py
new file mode 100644
index 00000000000..92ee99a0383
--- /dev/null
+++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2011ppp.py
@@ -0,0 +1,34 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("world_bank_pip")
+
+    # Read tables from garden dataset.
+    tb = ds_garden["income_consumption_2011"]
+
+    #
+    # Process data.
+    #
+    # Drop reporting_level and welfare_type columns
+    tb = tb.drop(columns=["reporting_level", "welfare_type"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
+    )
+
+    # Save changes in the new grapher dataset.
+ ds_grapher.save() diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml new file mode 100644 index 00000000000..4afca360dd5 --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.meta.yml @@ -0,0 +1,4 @@ +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/dataset/ +dataset: + title: World Bank Poverty and Inequality Platform (PIP) (2017 prices) diff --git a/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py new file mode 100644 index 00000000000..319cf11c36e --- /dev/null +++ b/etl/steps/data/grapher/wb/2024-03-27/world_bank_pip_2017ppp.py @@ -0,0 +1,34 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("world_bank_pip") + + # Read tables from garden dataset. + tb = ds_garden["income_consumption_2017"] + + # + # Process data. + # + # Drop reporting_level and welfare_type columns + tb = tb.drop(columns=["reporting_level", "welfare_type"]) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py b/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py new file mode 100644 index 00000000000..90c84c0726d --- /dev/null +++ b/etl/steps/data/meadow/wb/2024-03-27/world_bank_pip.py @@ -0,0 +1,51 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. + # For key indicators + snap = paths.load_snapshot("world_bank_pip.csv") + tb = snap.read() + + # For percentiles + snap_percentiles = paths.load_snapshot("world_bank_pip_percentiles.csv") + tb_percentiles = snap_percentiles.read() + + # + # Process data. + # + + # Make reporting_level and welfare_type strings + tb["reporting_level"] = tb["reporting_level"].astype(str) + tb["welfare_type"] = tb["welfare_type"].astype(str) + tb_percentiles["reporting_level"] = tb_percentiles["reporting_level"].astype(str) + tb_percentiles["welfare_type"] = tb_percentiles["welfare_type"].astype(str) + + # Set index and sort + tb = tb.set_index( + ["ppp_version", "poverty_line", "country", "year", "reporting_level", "welfare_type"], verify_integrity=True + ).sort_index() + + tb_percentiles = tb_percentiles.set_index( + ["ppp_version", "country", "year", "reporting_level", "welfare_type", "percentile"], + verify_integrity=True, + ).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, tables=[tb, tb_percentiles], check_variables_metadata=True, default_metadata=snap.metadata + ) + + # Save changes in the new garden dataset. 
+ ds_meadow.save() diff --git a/snapshots/wb/2024-03-27/pip_api.py b/snapshots/wb/2024-03-27/pip_api.py new file mode 100644 index 00000000000..8b4d17d1c94 --- /dev/null +++ b/snapshots/wb/2024-03-27/pip_api.py @@ -0,0 +1,1573 @@ +""" +DATA EXTRACTION FOR THE WORLD BANK POVERTY AND INEQUALITY PLATFORM (PIP) API + +This code generates key indicators and percentiles from the World Bank PIP API. +This is done by combining the results of several queries to the API: + - A set of poverty lines (8) to obtain key indicators per PPP year (2011, 2017) and for countries and regions. + - 2298 poverty lines to construct percentiles for a group of countries. + - 5148 poverty lines to construct percentiles for all the regions. + - 8217 of poverty lines to construct estimates of relative poverty. + +Percentiles are partially constructed because the data officially published by the World Bank is missing some countries and all the regions. + +To run this code from scratch, + - Connect to the staging server of this pull request: + - Hit Cmd + Shift + P and select Remote-SSH: Connect to Host + - Type in owid@staging-site-{pull_request_name} + - Delete the files in the cache folder: + rm -rf .cache/* + - Check if you need to update the poverty lines in the functions `poverty_lines_countries` and `poverty_lines_regions`. Run + - https://api.worldbank.org/pip/v1/pip?country=CHN&year=all&povline=80&fill_gaps=false&welfare_type=all&reporting_level=all&additional_ind=false&ppp_version=2017&identity=PROD&format=csv + - https://api.worldbank.org/pip/v1/pip-grp?country=OHI&year=all&povline=300&group_by=wb&welfare_type=all&reporting_level=all&additional_ind=false&ppp_version=2017&format=csv + - And see if any of the `headcount` values is lower than 0.99. If so, you need to add more poverty lines to the functions. + - Run the code. You have two options to see the output, in the terminal or in the background: + python snapshots/wb/{version}/pip_api.py + nohup poetry run python snapshots/wb/{version}/pip_api.py > output.log 2>&1 & + +When the code finishes, you will have the following files in the cache folder: + - world_bank_pip.csv: file with the results of the queries for key indicators (8 for countries and 8 for regions), plus some additional indicators (thresholds, relative poverty). + - pip_percentiles.csv: file with the percentiles taken from WB Databank and the ones constructed from the API. + +Copy these files to this folder and run in the terminal: + python snapshots/wb/{version}/world_bank_pip.py --path-to-file snapshots/wb/{version}/world_bank_pip.csv + python snapshots/wb/{version}/world_bank_pip_percentiles.py --path-to-file snapshots/wb/{version}/pip_percentiles.csv + +You can delete the files after this. + +""" + + +import io +import time +from multiprocessing.pool import ThreadPool +from pathlib import Path + +import click +import numpy as np +import pandas as pd +import requests +from botocore.exceptions import ClientError +from joblib import Memory +from structlog import get_logger +from tenacity import retry +from tenacity.stop import stop_after_attempt +from tenacity.wait import wait_random_exponential + +from etl.files import checksum_str +from etl.paths import CACHE_DIR +from etl.publish import connect_s3_cached + +# Initialize logger. 
+log = get_logger() + +memory = Memory(CACHE_DIR, verbose=0) + +# Basic parameters to use in the functions +MAX_REPEATS = 15 +TIMEOUT = 500 +FILL_GAPS = "false" +# NOTE: Although the number of workers is set to MAX_WORKERS, the actual number of workers for regional queries is half of that, because the API (`pip-grp`) is less able to handle concurrent requests. +MAX_WORKERS = 2 +TOLERANCE_PERCENTILES = 1 + + +# Select live (1) or internal (0) API +LIVE_API = 1 + + +# Constants +def poverty_lines_countries(): + """ + These poverty lines are used to calculate percentiles for countries that are not in the percentile file. + # We only extract to $80 because the highest P99 not available is China, with $64.5 + # NOTE: In future updates, check if these poverty lines are enough for the extraction + """ + # Define poverty lines and their increase + + under_2_dollars = list(range(1, 200, 1)) + between_2_and_5_dollars = list(range(200, 500, 2)) + between_5_and_10_dollars = list(range(500, 1000, 5)) + between_10_and_20_dollars = list(range(1000, 2000, 10)) + between_20_and_30_dollars = list(range(2000, 3000, 10)) + between_30_and_55_dollars = list(range(3000, 5500, 10)) + between_55_and_80_dollars = list(range(5500, 8000, 10)) + between_80_and_100_dollars = list(range(8000, 10000, 10)) + between_100_and_150_dollars = list(range(10000, 15000, 10)) + + # povlines is all these lists together + povlines = ( + under_2_dollars + + between_2_and_5_dollars + + between_5_and_10_dollars + + between_10_and_20_dollars + + between_20_and_30_dollars + + between_30_and_55_dollars + + between_55_and_80_dollars + + between_80_and_100_dollars + + between_100_and_150_dollars + ) + + return povlines + + +def poverty_lines_regions(): + """ + These poverty lines are used to calculate percentiles for regions. None of them are in the percentile file. + # We only extract to $300 because the highest P99 not available is Other High Income Countries, with $280 + # NOTE: In future updates, check if these poverty lines are enough for the extraction + """ + # Define poverty lines and their increase + + under_2_dollars = list(range(1, 200, 1)) + between_2_and_5_dollars = list(range(200, 500, 2)) + between_5_and_10_dollars = list(range(500, 1000, 5)) + between_10_and_20_dollars = list(range(1000, 2000, 10)) + between_20_and_30_dollars = list(range(2000, 3000, 10)) + between_30_and_55_dollars = list(range(3000, 5500, 10)) + between_55_and_80_dollars = list(range(5500, 8000, 10)) + between_80_and_100_dollars = list(range(8000, 10000, 10)) + between_100_and_150_dollars = list(range(10000, 15000, 10)) + between_150_and_175_dollars = list(range(15000, 17500, 10)) + between_175_and_250_dollars = list(range(17500, 25000, 20)) + between_250_and_300_dollars = list(range(25000, 30000, 50)) + + # povlines is all these lists together + povlines = ( + under_2_dollars + + between_2_and_5_dollars + + between_5_and_10_dollars + + between_10_and_20_dollars + + between_20_and_30_dollars + + between_30_and_55_dollars + + between_55_and_80_dollars + + between_80_and_100_dollars + + between_100_and_150_dollars + + between_150_and_175_dollars + + between_175_and_250_dollars + + between_250_and_300_dollars + ) + + return povlines + + +# Define poverty lines for key indicators, depending on the PPP version. +# It includes the international poverty line, lower and upper-middle income lines, and some other lines. 
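+# NOTE: Values are expressed in cents and divided by 100 before querying;
+# e.g. 215 corresponds to the $2.15 international poverty line in 2017 PPPs.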
+POVLINES_DICT = { + 2011: [100, 190, 320, 550, 1000, 2000, 3000, 4000], + 2017: [100, 215, 365, 685, 1000, 2000, 3000, 4000], +} + + +PPP_VERSIONS = [2011, 2017] +POV_LINES_COUNTRIES = poverty_lines_countries() +POV_LINES_REGIONS = poverty_lines_regions() + +# # DEBUGGING +# PPP_VERSIONS = [2017] +# POV_LINES_COUNTRIES = [1, 1000, 25000, 50000] +# POV_LINES_REGIONS = [1, 1000, 25000, 50000] + + +@click.command() +@click.option( + "--live-api/--internal-api", + default=True, + type=bool, + help="Select live (1) or internal (0) API", +) +# @click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def run(live_api: bool) -> None: + if live_api: + wb_api = WB_API("https://api.worldbank.org/pip/v1") + else: + wb_api = WB_API("https://apiv2qa.worldbank.org/pip/v1") + + # Generate percentiles by extracting the raw files and processing them afterward + df_percentiles = generate_consolidated_percentiles(generate_percentiles_raw(wb_api), wb_api) + + # Generate relative poverty indicators file + df_relative = generate_relative_poverty(wb_api) + + # Generate key indicators file and patch medians + df = generate_key_indicators(wb_api) + df = median_patch(df, country_or_region="country") + + # Add relative poverty indicators and decile thresholds to the key indicators file + df = add_relative_poverty_and_decile_threholds(df, df_relative, df_percentiles) + + +class WB_API: + def __init__(self, api_address, check_health=False): + self.api_address = api_address + self.check_health = check_health + + def health_check(self): + return pd.read_json(f"{self.api_address}/health-check")[0][0] + + def api_health(self): + """ + Check if the API is running. + """ + if not self.check_health: + return + + # Initialize repeat counter + repeat = 0 + + # health comes from a json containing the status + health = self.health_check() + + # If the status is different to "PIP API is running", repeat the request until MAX_REPEATS + while health != "PIP API is running" and repeat < MAX_REPEATS: + repeat += 1 + + if repeat >= MAX_REPEATS: + # If the status is different to "PIP API is running" after MAX_REPEATS, log fatal error + raise AssertionError(f"Health check: {health} (repeated {repeat} times)") + + def versions(self): + return memory.cache(pd.read_csv)(f"{self.api_address}/versions?format=csv") + + def get_table(self, table): + return pd.read_csv(f"{self.api_address}/aux?table={table}&long_format=false&format=csv") + + def fetch_csv(self, url): + return _fetch_csv(f"{self.api_address}{url}") + + +@retry(wait=wait_random_exponential(multiplier=1), stop=stop_after_attempt(MAX_REPEATS)) +def _get_request(url: str) -> requests.Response: + response = requests.get(url, timeout=TIMEOUT) + if response.status_code != 200: + log.info("fetch_csv.retry", url=url) + raise Exception("API timed out") + + if b"Server Error" in response.content: + raise Exception("API returned server error") + + return response + + +@memory.cache +def _fetch_csv(url: str) -> pd.DataFrame: + r2 = connect_s3_cached() + r2_bucket = "owid-private" + r2_key = "cache/pip_api/" + checksum_str(url) + + # try to get it from cache + try: + obj = r2.get_object(Bucket=r2_bucket, Key=r2_key) + s = obj["Body"].read().decode("utf-8") + # we might have cached invalid responses, in that case fetch it again + if "Server Error" not in s: + df = pd.read_csv(io.StringIO(s)) + log.info("fetch_csv.cache_hit", url=url) + return df + else: + log.info("fetch_csv.cache_with_error", url=url) + except ClientError: + pass + + log.info("fetch_csv.start", 
url=url) + response = _get_request(url) + log.info("fetch_csv.success", url=url, t=response.elapsed.total_seconds()) + + # save the result to R2 cache + r2.put_object( + Body=response.content, + Bucket=r2_bucket, + Key=r2_key, + ) + + df = pd.read_csv(io.StringIO(response.content.decode("utf-8"))) + return df + + +@memory.cache +def _fetch_percentiles(version: int) -> pd.DataFrame: + # These URLs were copied from https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles + if version == 2011: + url = "https://datacatalogfiles.worldbank.org/ddh-published/0063646/DR0090357/world_100bin.csv" + elif version == 2017: + url = "https://datacatalogfiles.worldbank.org/ddh-published/0063646/DR0090251/world_100bin.csv" + else: + raise ValueError(f"Version {version} is not supported") + return pd.read_csv(url) + + +############################################################################################################ +# FUNCTIONS + + +def pip_aux_tables(wb_api: WB_API, table="all"): + """ + Download aux tables if the API is running. + """ + + wb_api.api_health() + + if table == "all": + aux_tables_list = [ + "aux_versions", + "countries", + "country_coverage", + "country_list", + "cpi", + "decomposition", + "dictionary", + "framework", + "gdp", + "incgrp_coverage", + "indicators", + "interpolated_means", + "missing_data", + "national_poverty_lines", + "pce", + "pop", + "pop_region", + "poverty_lines", + "ppp", + "region_coverage", + "regions", + "spl", + "survey_means", + ] + # Create a list of dataframes + df_dict = {} + + # Download each table and append it to the list + for tab in aux_tables_list: + df = wb_api.get_table(tab) + + # Add table to df_dict + df_dict[tab] = df + + else: + df = wb_api.get_table(table) + + # Add table to df_dict + df_dict = {table: df} + + log.info(f'Auxiliary tables downloaded ("{table}")') + + return df_dict + + +def pip_versions(wb_api) -> dict: + """ + Download latest PIP data versions if the API is running. + """ + + wb_api.api_health() + + df = wb_api.versions() + df = df[["ppp_version", "release_version", "version"]] + + # Obtain the max release_version + max_release_version = df["release_version"].max() + + # Get the version for ppp_versions 2011 and 2017 + versions = df[df["release_version"] == max_release_version] + + # Set index and convert to dict + versions = versions.set_index("ppp_version", verify_integrity=True).sort_index().to_dict(orient="index") + + version_2011 = versions[2011]["version"] + version_2017 = versions[2017]["version"] + + log.info(f"PIP dataset versions extracted: 2011 = {version_2011}, 2017 = {version_2017}") + + return versions + + +def pip_query_country( + wb_api: WB_API, + popshare_or_povline, + value, + versions, + country_code="all", + year="all", + fill_gaps="true", + welfare_type="all", + reporting_level="all", + ppp_version=2017, + download="false", +) -> pd.DataFrame: + """ + Query country data from the PIP API. 
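+    For reference, the request built below has the form (illustrative values):
+    /pip?povline=2.15&country=all&year=all&fill_gaps=false&welfare_type=all&reporting_level=all&ppp_version=2017&version=<version>&release_version=<release_version>&format=csv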
+ """ + + # Test health of the API + wb_api.api_health() + + # Round povline (popshare) to 2 decimals to work with cents as the minimum unit + value = round(value, 2) + + # Extract version and release_version from versions dict + version = versions[ppp_version]["version"] + release_version = versions[ppp_version]["release_version"] + + # Build query + df = wb_api.fetch_csv( + f"/pip?{popshare_or_povline}={value}&country={country_code}&year={year}&fill_gaps={fill_gaps}&welfare_type={welfare_type}&reporting_level={reporting_level}&ppp_version={ppp_version}&version={version}&release_version={release_version}&format=csv" + ) + + # Add PPP version as column + df["ppp_version"] = ppp_version + + # Replace names of columns and drop redundancies + df = df.rename(columns={"country_name": "country", "reporting_year": "year"}) + df = df.drop(columns=["region_name", "region_code"]) + + # Reorder columns: ppp_version, country, year, povline and the rest + first_columns = ["ppp_version", "country", "year", "poverty_line"] + df = df[first_columns + [column for column in df.columns if column not in first_columns]] + + if download == "true": + # make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_country_data").mkdir(parents=True, exist_ok=True) + # Save to csv + df.to_csv( + f"{CACHE_DIR}/pip_country_data/pip_country_{country_code}_year_{year}_{popshare_or_povline}_{int(round(value*100))}_welfare_{welfare_type}_rep_{reporting_level}_fillgaps_{fill_gaps}_ppp_{ppp_version}.csv", + index=False, + ) + + if country_code == "all": + log.info(f"Country data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs)") + else: + log.info( + f"Country data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs) in {country_code} (year = {year})" + ) + + return df + + +def pip_query_region( + wb_api: WB_API, + popshare_or_povline, + value, + versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=2017, + download="false", +) -> pd.DataFrame: + """ + Query regional data from the PIP API. + """ + + # Test health of the API + wb_api.api_health() + + # Round povline (popshare) to 2 decimals to work with cents as the minimum unit + value = round(value, 2) + + # Extract version and release_version from versions dict + version = versions[ppp_version]["version"] + release_version = versions[ppp_version]["release_version"] + + # Build query + df = wb_api.fetch_csv( + f"/pip-grp?{popshare_or_povline}={value}&country={country_code}&year={year}&welfare_type={welfare_type}&reporting_level={reporting_level}&ppp_version={ppp_version}&version={version}&release_version={release_version}&format=csv" + ) + + # Add PPP version as column + df["ppp_version"] = ppp_version + + # Replace names of columns and drop redundancies + df = df.rename(columns={"region_name": "country", "reporting_year": "year", "region_code": "country_code"}) + + # Reorder columns: ppp_version, country, year, povline and the rest + first_columns = ["ppp_version", "country", "year", "poverty_line"] + df = df[first_columns + [column for column in df.columns if column not in first_columns]] + + if download == "true": + # make sure the directory exists. 
If not, create it + Path(f"{CACHE_DIR}/pip_region_data").mkdir(parents=True, exist_ok=True) + # Save to csv + df.to_csv( + f"{CACHE_DIR}/pip_region_data/pip_region_{country_code}_year_{year}_{popshare_or_povline}_{int(round(value*100))}_ppp_{ppp_version}.csv", + index=False, + ) + + if country_code == "all": + log.info(f"Regional data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs)") + else: + log.info( + f"Regional data extracted for {popshare_or_povline} = {value} ({ppp_version} PPPs) in {country_code} (year = {year})" + ) + + return df + + +# GENERATE PERCENTILES FILES +# This is data not given directly by the query, but we can get it by querying a huge set of poverty lines and assign percentiles according to headcount ratio results. + + +def generate_percentiles_raw(wb_api: WB_API): + """ + Generates percentiles data from query results. This is the raw data to get the percentiles. + Uses concurrent.futures to speed up the process. + """ + start_time = time.time() + + def get_percentiles_data(povline, versions, ppp_version, country_code): + """ + Check if country percentiles data exists. If not, run the query. + """ + if Path( + f"{CACHE_DIR}/pip_country_data/pip_country_{country_code}_year_all_povline_{povline}_welfare_all_rep_all_fillgaps_{FILL_GAPS}_ppp_{ppp_version}.csv" + ).is_file(): + return + + else: + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code=country_code, + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="true", + ) + + def concurrent_percentiles_function(country_code): + """ + Executes get_percentiles_data concurrently. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_country_data").mkdir(parents=True, exist_ok=True) + + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, versions, ppp_version, country_code) + for ppp_version in PPP_VERSIONS + for povline in POV_LINES_COUNTRIES + ] + pool.starmap(get_percentiles_data, tasks) + + def get_percentiles_data_region(povline, versions, ppp_version): + """ + Check if region percentiles data exists. If not, run the query. + """ + if Path( + f"{CACHE_DIR}/pip_region_data/pip_region_all_year_all_povline_{povline}_ppp_{ppp_version}.csv" + ).is_file(): + return + else: + return pip_query_region( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="true", + ) + + def concurrent_percentiles_region_function(): + """ + Executes get_percentiles_data_region concurrently. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_region_data").mkdir(parents=True, exist_ok=True) + with ThreadPool(MAX_WORKERS) as pool: + tasks = [(povline, versions, ppp_version) for ppp_version in PPP_VERSIONS for povline in POV_LINES_REGIONS] + pool.starmap(get_percentiles_data_region, tasks) + + def get_query_country(povline, ppp_version, country_code): + """ + Here I check if the country file exists even after the original extraction. If it does, I read it. If not, I start the queries again. 
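+        The per-country CSVs read here are the ones cached by get_percentiles_data and are concatenated
+        below into the raw percentile data.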
+ """ + file_path_country = f"{CACHE_DIR}/pip_country_data/pip_country_{country_code}_year_all_povline_{povline}_welfare_all_rep_all_fillgaps_{FILL_GAPS}_ppp_{ppp_version}.csv" + if Path(file_path_country).is_file(): + df_query_country = pd.read_csv(file_path_country) + else: + # Run the main function to get the data + log.warning( + f"We need to come back to the extraction! countries = {country_code}, {povline}, {ppp_version} PPPs)" + ) + get_percentiles_data(povline, versions, ppp_version, country_code) + df_query_country = pd.read_csv(file_path_country) + + return df_query_country + + def get_query_region(povline, ppp_version): + """ + Here I check if the regional file exists even after the original extraction. If it does, I read it. If not, I start the queries again. + """ + file_path_region = ( + f"{CACHE_DIR}/pip_region_data/pip_region_all_year_all_povline_{povline}_ppp_{ppp_version}.csv" + ) + if Path(file_path_region).is_file(): + df_query_region = pd.read_csv(file_path_region) + else: + # Run the main function to get the data + log.warning(f"We need to come back to the extraction! regions, {povline}, {ppp_version} PPPs)") + get_percentiles_data_region(povline, versions, ppp_version) + df_query_region = pd.read_csv(file_path_region) + + return df_query_region + + def get_list_of_missing_countries(): + """ + Compare the list of countries in a common query (reference file) and the list of countries in the percentile file. + It generates missing_countries, which is a string with all the elements of the list, in the format for querying multiple countries in the API. + And also missing_countries_list, which is a list of the countries. + """ + # Obtain the percentile files the World Bank publishes in their Databank + + df_percentiles_published_2017 = _fetch_percentiles(2017) + + # FOR COUNTRIES + # Get data from the most common query + df_reference = pip_query_country( + wb_api, + popshare_or_povline="povline", + value=2.15, + versions=versions, + country_code="all", + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=2017, + ) + + # Edit percentile file to get the list of different countries + df_percentiles_pub = df_percentiles_published_2017.copy() + df_percentiles_pub = df_percentiles_pub.drop( + columns=["percentile", "avg_welfare", "pop_share", "welfare_share", "quantile"] + ).drop_duplicates() + + # Merge the two files + df_merge = pd.merge( + df_reference, + df_percentiles_pub, + on=["country_code", "year", "reporting_level", "welfare_type"], + how="outer", + indicator=True, + ) + + # Obtain the list of countries that are in the reference file but not in the percentile file + list_missing_countries = df_merge.loc[df_merge["_merge"] == "left_only", "country_code"].unique().tolist() + + # Generate a string with all the elements of the list, in the format for querying multiple countries in the API + missing_countries = "&country=".join(list_missing_countries) + + return missing_countries, list_missing_countries + + # Obtain latest versions of the PIP dataset + versions = pip_versions(wb_api) + + # Run the main function + missing_countries, list_missing_countries = get_list_of_missing_countries() + log.info( + f"These countries are available in a common query but not in the percentile file: {list_missing_countries}" + ) + + concurrent_percentiles_function(country_code=missing_countries) + log.info("Country files downloaded") + concurrent_percentiles_region_function() + log.info("Region files downloaded") + + log.info("Now we are 
concatenating the files") + + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, ppp_version, missing_countries) for ppp_version in PPP_VERSIONS for povline in POV_LINES_COUNTRIES + ] + dfs = pool.starmap(get_query_country, tasks) + + df_country = pd.concat(dfs, ignore_index=True) + log.info("Country files concatenated") + + with ThreadPool(MAX_WORKERS) as pool: + tasks = [(povline, ppp_version) for ppp_version in PPP_VERSIONS for povline in POV_LINES_REGIONS] + dfs = pool.starmap(get_query_region, tasks) + + df_region = pd.concat(dfs, ignore_index=True) + log.info("Region files concatenated") + + # Create poverty_line_cents column, multiplying by 100, rounding and making it an integer + df_country["poverty_line_cents"] = round(df_country["poverty_line"] * 100).astype(int) + df_region["poverty_line_cents"] = round(df_region["poverty_line"] * 100).astype(int) + + log.info("Checking if all the poverty lines are in the concatenated files") + + # Check if all the poverty lines are in the df in country and region df + assert set(df_country["poverty_line_cents"].unique()) == set(POV_LINES_COUNTRIES), log.fatal( + "Not all poverty lines are in the country file!" + ) + assert set(df_region["poverty_line_cents"].unique()) == set(POV_LINES_REGIONS), log.fatal( + "Not all poverty lines are in the region file!" + ) + + # Drop poverty_line_cents column + df_country = df_country.drop(columns=["poverty_line_cents"]) + df_region = df_region.drop(columns=["poverty_line_cents"]) + + log.info("Checking if the set of countries and regions is the same as in PIP") + + # I check if the set of countries is the same in the df and in the list of missing countries + assert set(df_country["country_code"].unique()) == set(list_missing_countries), log.fatal( + f"List of countries is different from the one we needed to extract! ({list_missing_countries})" + ) + + # I check if the set of regions is the same in the df and in the aux table (list of regions) + aux_dict = pip_aux_tables(wb_api, table="regions") + assert set(df_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal( + "List of regions is not the same as the one defined in PIP!" + ) + + log.info("Concatenating the raw percentile data for countries and regions") + + # Concatenate df_country and df_region + df = pd.concat([df_country, df_region], ignore_index=True) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info( + f"Concatenation of raw percentile data for countries and regions completed. Execution time: {elapsed_time} seconds" + ) + + return df + + +def calculate_percentile(p, df): + """ + Calculates a single percentile and returns a DataFrame with the results. 
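+    For each (ppp_version, country, year, reporting_level, welfare_type) group, the queried poverty line whose
+    headcount is closest to p% is taken as the threshold for percentile p; the gap is kept in distance_to_p.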
+ """ + df["distance_to_p"] = abs(df["headcount"] * 100 - p) + df_closest = ( + df.sort_values("distance_to_p") + .groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], + as_index=False, + sort=False, + dropna=False, # This is to avoid dropping rows with NaNs (reporting_level and welfare_type for regions) + ) + .first() + ) + df_closest["target_percentile"] = p + df_closest = df_closest[ + [ + "ppp_version", + "country", + "year", + "reporting_level", + "welfare_type", + "target_percentile", + "poverty_line", + "headcount", + "distance_to_p", + ] + ] + log.info(f"Percentile {p}: calculated") + return df_closest + + +def format_official_percentiles(year, wb_api: WB_API): + """ + Download percentiles from the World Bank Databank and format them to the same format as the constructed percentiles + """ + # Load percentile files from the World Bank Databank + df_percentiles_published = _fetch_percentiles(year) + + # Obtain country names from the aux table + aux_dict = pip_aux_tables(wb_api, table="countries") + df_countries = aux_dict["countries"] + + # Merge the two files to get country names + df_percentiles_published = pd.merge( + df_percentiles_published, + df_countries[["country_code", "country_name"]], + on="country_code", + how="left", + ) + + # Rename columns + df_percentiles_published = df_percentiles_published.rename( + columns={ + "country_name": "country", + "percentile": "target_percentile", + "avg_welfare": "avg", + "welfare_share": "share", + "quantile": "thr", + } + ) + + # Drop pop_share + df_percentiles_published = df_percentiles_published.drop(columns=["pop_share"]) + + # Make thr null if target_percentile is 100 + df_percentiles_published.loc[df_percentiles_published["target_percentile"] == 100, "thr"] = np.nan + + # Add ppp_version column + df_percentiles_published["ppp_version"] = year + + return df_percentiles_published + + +def generate_consolidated_percentiles(df, wb_api: WB_API): + """ + Generates percentiles from the raw data. This is the final file with percentiles. + """ + start_time = time.time() + + path_file_percentiles = f"{CACHE_DIR}/pip_percentiles_before_checks.csv" + + if Path(path_file_percentiles).is_file(): + log.info("Percentiles file already exists. No need to consolidate.") + df_percentiles = pd.read_csv(path_file_percentiles) + + else: + log.info("Consolidating percentiles") + + # Define percentiles, from 1 to 99 + percentiles = range(1, 100, 1) + df_percentiles = pd.DataFrame() + + # Estimate percentiles + dfs = [calculate_percentile(p, df) for p in percentiles] + + df_percentiles = pd.concat(dfs, ignore_index=True) + + log.info("Percentiles calculated and consolidated") + + # Rename headcount to estimated_percentile and poverty_line to thr + df_percentiles = df_percentiles.rename(columns={"headcount": "estimated_percentile", "poverty_line": "thr"}) # type: ignore + + # Add official percentiles from the World Bank Databank + df_percentiles_published_2011 = format_official_percentiles(2011, wb_api) + df_percentiles_published_2017 = format_official_percentiles(2017, wb_api) + + df_percentiles = pd.concat( + [df_percentiles, df_percentiles_published_2011, df_percentiles_published_2017], ignore_index=True + ) + + # Drop duplicates. 
Keep the second one (the official one) + df_percentiles = df_percentiles.drop_duplicates( + subset=["ppp_version", "country", "year", "reporting_level", "welfare_type", "target_percentile"], + keep="last", + ) + + # Sort by ppp_version, country, year, reporting_level, welfare_type and target_percentile + df_percentiles = df_percentiles.sort_values( + by=["ppp_version", "country", "year", "reporting_level", "welfare_type", "target_percentile"] + ) + + # Save to csv + df_percentiles.to_csv(f"{CACHE_DIR}/pip_percentiles_before_checks.csv", index=False) + + # SANITY CHECKS + df_percentiles = sanity_checks(df_percentiles) + + # Drop distance_to_p, estimated_percentile, country_code + df_percentiles = df_percentiles.drop(columns=["distance_to_p", "estimated_percentile", "country_code"]) + + # Rename target_percentile to percentile + df_percentiles = df_percentiles.rename(columns={"target_percentile": "percentile"}) + + # Save to csv + df_percentiles.to_csv(f"{CACHE_DIR}/pip_percentiles.csv", index=False) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info(f"Percentiles calculated and checked. Execution time: {elapsed_time} seconds") + + return df_percentiles + + +def sanity_checks(df_percentiles): + """ + Run different sanity checks to the percentiles file. + """ + log.info("Starting sanity checks") + + # Count number of rows before checks + rows_before = len(df_percentiles) + + # Consecutive percentiles (1, 2, 3, etc) + # Create a column called check that is True if target_percentile is consecutive for each ppp_version, country, year, reporting_level, and welfare_type + df_percentiles["check"] = ( + df_percentiles.groupby(["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False)[ + "target_percentile" + ].diff() + == 1 + ) + + # Replace check with True if target_percentile is 1 + df_percentiles.loc[df_percentiles["target_percentile"] == 1, "check"] = True + + # Assign the boolean value to the entire group + df_percentiles["check"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["check"].transform("all") + + # Define mask + mask = ~df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Percentiles are not consecutive! These distributions will not be used: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Drop faulty distributions + df_percentiles = df_percentiles[~mask].reset_index(drop=True) + + ############################################################################################################ + # Distance_to_p is higher than TOLERANCE_PERCENTILES + df_percentiles["check"] = df_percentiles["distance_to_p"] > TOLERANCE_PERCENTILES + + # Assign the boolean value to the entire group + df_percentiles["check"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["check"].transform("any") + + # Define mask + mask = df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Percentiles are not accurate! 
These distributions will not be used: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Drop faulty distributions + df_percentiles = df_percentiles[~mask].reset_index(drop=True) + + ############################################################################################################ + # Nulls for thr, avg and share for the entire group of ppp_version, country, year, reporting_level, and welfare_type + df_percentiles["check_thr"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["thr"].transform(lambda x: x.isnull().all()) + df_percentiles["check_avg"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["avg"].transform(lambda x: x.isnull().all()) + df_percentiles["check_share"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["share"].transform(lambda x: x.isnull().all()) + + df_percentiles["check"] = df_percentiles["check_thr"] & df_percentiles["check_avg"] & df_percentiles["check_share"] + + # Define mask + mask = df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are null values for thr, avg and share! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Drop distributions with null values for thr, avg and share + df_percentiles = df_percentiles[~mask].reset_index(drop=True) + + ############################################################################################################ + # Find negative values for thr + df_percentiles["check"] = df_percentiles["thr"] < 0 + + # Define mask + mask = df_percentiles["check"] + + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are negative values for thr! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where thr, avg and share are negative, by assigning 0 + df_percentiles.loc[mask, "thr"] = 0 + + ############################################################################################################ + # Find negative values for avg + df_percentiles["check"] = df_percentiles["avg"] < 0 + + # Define mask + mask = df_percentiles["check"] + + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are negative values for avg! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where thr, avg and share are negative, by assigning 0 + df_percentiles.loc[mask, "avg"] = 0 + + ############################################################################################################ + # Find negative values for share + df_percentiles["check"] = df_percentiles["share"] < 0 + + # Define mask + mask = df_percentiles["check"] + + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""There are negative values for share! 
These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where thr, avg and share are negative, by assigning 0 + df_percentiles.loc[mask, "share"] = 0 + + ############################################################################################################ + # thr is increasing for each ppp_version, country, year, reporting_level, and welfare_type + df_percentiles["check"] = ( + df_percentiles.groupby(["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False)[ + "thr" + ] + .diff() + .round(2) + >= 0 + ) + + # Replace check with True if thr is NaN + df_percentiles.loc[df_percentiles["thr"].isna(), "check"] = True + + # Replace check with True if target_percentile is 1 + df_percentiles.loc[(df_percentiles["target_percentile"] == 1), "check"] = True + + # Define mask + mask = ~df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Thresholds are not increasing! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where thr is not increasing, by repeating the previous thr + df_percentiles.loc[mask, "thr"] = df_percentiles.loc[mask, "thr"].shift(1) + + ############################################################################################################ + # avg is increasing for each ppp_version, country, year, reporting_level, and welfare_type + df_percentiles["check"] = ( + df_percentiles.groupby(["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False)[ + "avg" + ] + .diff() + .round(2) + >= 0 + ) + + # Replace check with True if avg is NaN + df_percentiles.loc[df_percentiles["avg"].isna(), "check"] = True + + # Replace check with True if target_percentile is 1 + df_percentiles.loc[(df_percentiles["target_percentile"] == 1), "check"] = True + + # Define mask + mask = ~df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Averages are not increasing! 
These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where avg is not increasing, by repeating the previous avg + df_percentiles.loc[mask, "avg"] = df_percentiles.loc[mask, "avg"].shift(1) + + ############################################################################################################ + # Check that avg are between thresholds + # Create thr_lower, which is the threshold for the previous percentile + df_percentiles["thr_lower"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["thr"].shift(1) + df_percentiles["check"] = (round(df_percentiles["avg"] - df_percentiles["thr_lower"], 2) >= 0) & ( + round(df_percentiles["thr"] - df_percentiles["avg"]) >= 0 + ) + + # Assign True if target_percentile is 1 + df_percentiles.loc[df_percentiles["target_percentile"] == 1, "check"] = True + + # Assign True if target_percentile is 100 and avg is greater than thr_lower + df_percentiles.loc[ + (df_percentiles["target_percentile"] == 100) + & (round(df_percentiles["avg"] - df_percentiles["thr_lower"], 2) >= 0), + "check", + ] = True + + # Assign True if avg is null + df_percentiles.loc[df_percentiles["avg"].isnull(), "check"] = True + + # Assign the boolean value to the entire group + df_percentiles["check"] = df_percentiles.groupby( + ["ppp_version", "country", "year", "reporting_level", "welfare_type"], dropna=False + )["check"].transform("all") + + # Define mask + mask = ~df_percentiles["check"] + df_error = df_percentiles[mask].reset_index(drop=True).copy() + + if len(df_error) > 0: + log.warning( + f"""Averages are not between thresholds! These distributions need to be corrected: + {df_error[["ppp_version", "country", "year", "reporting_level", "welfare_type"]].drop_duplicates()}""" + ) + # Correct cases where avg is not between thresholds, by averaging the two thresholds + df_percentiles.loc[mask, "avg"] = (df_percentiles.loc[mask, "thr_lower"] + df_percentiles.loc[mask, "thr"]) / 2 + + # Drop check columns + df_percentiles = df_percentiles.drop(columns=["check", "check_thr", "check_avg", "check_share", "thr_lower"]) + + # Count number of rows after checks + rows_after = len(df_percentiles) + + log.info(f"Percentiles file generated. {rows_before - rows_after} rows have been deleted.") + + return df_percentiles + + +# GENERATE RELATIVE POVERTY INDICATORS FILE +# This is data not given directly by the query, but we can get it by calculating 40, 50, 60% of the median and query +# NOTE: Medians need to be patched first in order to get data for all country-years (there are several missing values) + + +def generate_relative_poverty(wb_api: WB_API): + """ + Generates relative poverty indicators from query results. Uses concurrent.futures to speed up the process. + """ + start_time = time.time() + + def get_relative_data(df_row, pct, versions): + """ + This function is structured in a way to make it work with concurrent.futures. + It checks if the country file related to the row exists. If not, it runs the query. 
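+        The poverty line queried is pct% of the country-year median (median * pct / 100), with pct in {40, 50, 60}.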
+ """ + if ~np.isnan(df_row["median"]): + if Path( + f"{CACHE_DIR}/pip_country_data/pip_country_{df_row['country_code']}_year_{df_row['year']}_povline_{int(round(df_row['median'] * pct))}_welfare_{df_row['welfare_type']}_rep_{df_row['reporting_level']}_fillgaps_{FILL_GAPS}_ppp_2017.csv" + ).is_file(): + return + else: + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=df_row["median"] * pct / 100, + versions=versions, + country_code=df_row["country_code"], + year=df_row["year"], + fill_gaps=FILL_GAPS, + welfare_type=df_row["welfare_type"], + reporting_level=df_row["reporting_level"], + ppp_version=2017, + download="true", + ) + + def concurrent_relative_function(df): + """ + This is the main function to make concurrency work for country data. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_country_data").mkdir(parents=True, exist_ok=True) + with ThreadPool(MAX_WORKERS) as pool: + tasks = [(df.iloc[i], pct, versions) for pct in [40, 50, 60] for i in range(len(df))] + pool.starmap(get_relative_data, tasks) + + def get_relative_data_region(df_row, pct, versions): + """ + This function is structured in a way to make it work with concurrent.futures. + It checks if the regional file related to the row exists. If not, it runs the query. + """ + if ~np.isnan(df_row["median"]): + if Path( + f"{CACHE_DIR}/pip_region_data/pip_region_{df_row['country_code']}_year_{df_row['year']}_povline_{int(round(df_row['median']*pct))}_ppp_2017.csv" + ).is_file(): + return + else: + return pip_query_region( + wb_api, + popshare_or_povline="povline", + value=df_row["median"] * pct / 100, + versions=versions, + country_code=df_row["country_code"], + year=df_row["year"], + welfare_type="all", + reporting_level="all", + ppp_version=2017, + download="true", + ) + + def concurrent_relative_region_function(df): + """ + This is the main function to make concurrency work for regional data. + """ + # Make sure the directory exists. If not, create it + Path(f"{CACHE_DIR}/pip_region_data").mkdir(parents=True, exist_ok=True) + with ThreadPool(int(round(MAX_WORKERS / 2))) as pool: + tasks = [(df.iloc[i], pct, versions) for pct in [40, 50, 60] for i in range(len(df))] + pool.starmap(get_relative_data_region, tasks) + + def add_relative_indicators(df, country_or_region): + """ + Integrates the relative indicators to the df. + """ + for pct in [40, 50, 60]: + # Initialize lists + headcount_ratio_list = [] + pgi_list = [] + pov_severity_list = [] + watts_list = [] + for i in range(len(df)): + if ~np.isnan(df["median"].iloc[i]): + if country_or_region == "country": + # Here I check if the file exists even after the original extraction. If it does, I read it. If not, I start the queries again. + file_path = f"{CACHE_DIR}/pip_country_data/pip_country_{df.iloc[i]['country_code']}_year_{df.iloc[i]['year']}_povline_{int(round(df.iloc[i]['median']*pct))}_welfare_{df.iloc[i]['welfare_type']}_rep_{df.iloc[i]['reporting_level']}_fillgaps_{FILL_GAPS}_ppp_2017.csv" + if Path(file_path).is_file(): + results = pd.read_csv(file_path) + else: + # Run the main function to get the data + get_relative_data(df.iloc[i], pct, versions) + results = pd.read_csv(file_path) + + elif country_or_region == "region": + # Here I check if the file exists even after the original extraction. If it does, I read it. If not, I start the queries again. 
+ file_path = f"{CACHE_DIR}/pip_region_data/pip_region_{df.iloc[i]['country_code']}_year_{df.iloc[i]['year']}_povline_{int(round(df.iloc[i]['median']*pct))}_ppp_2017.csv" + if Path(file_path).is_file(): + results = pd.read_csv(file_path) + else: + # Run the main function to get the data + get_relative_data_region(df.iloc[i], pct, versions) + results = pd.read_csv(file_path) + else: + raise ValueError("country_or_region must be 'country' or 'region'") + + headcount_ratio_value = results["headcount"].iloc[0] + headcount_ratio_list.append(headcount_ratio_value) + + pgi_value = results["poverty_gap"].iloc[0] + pgi_list.append(pgi_value) + + pov_severity_value = results["poverty_severity"].iloc[0] + pov_severity_list.append(pov_severity_value) + + watts_value = results["watts"].iloc[0] + watts_list.append(watts_value) + + else: + headcount_ratio_list.append(np.nan) + pgi_list.append(np.nan) + pov_severity_list.append(np.nan) + watts_list.append(np.nan) + + # Add the lists as columns to the df + df[f"headcount_ratio_{pct}_median"] = headcount_ratio_list + df[f"poverty_gap_index_{pct}_median"] = pgi_list + df[f"poverty_severity_{pct}_median"] = pov_severity_list + df[f"watts_{pct}_median"] = watts_list + + return df + + # Obtain versions + versions = pip_versions(wb_api) + + # FOR COUNTRIES + # Get data from the most common query + df_country = pip_query_country( + wb_api, + popshare_or_povline="povline", + value=2.15, + versions=versions, + country_code="all", + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=2017, + ) + + # Patch medians + df_country = median_patch(df_country, country_or_region="country") + + # Run the main function to get the data + concurrent_relative_function(df_country) + + # Add relative indicators from the results above + df_country = add_relative_indicators(df=df_country, country_or_region="country") + + # FOR REGIONS + # Get data from the most common query + df_region = pip_query_region( + wb_api, + popshare_or_povline="povline", + value=2.15, + versions=versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=2017, + ) + + # Patch medians + df_region = median_patch(df_region, country_or_region="region") + + # Run the main function to get the data + concurrent_relative_region_function(df_region) + + # Add relative indicators from the results above + df_region = add_relative_indicators(df=df_region, country_or_region="region") + + # Concatenate df_country and df_region + df = pd.concat([df_country, df_region], ignore_index=True) + + # Save to csv + df.to_csv(f"{CACHE_DIR}/pip_relative.csv", index=False) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info(f"Relative poverty indicators calculated. Execution time: {elapsed_time} seconds") + + return df + + +# GENERATE MAIN INDICATORS FILE + + +def generate_key_indicators(wb_api: WB_API): + """ + Generate the main indicators file, from a set of poverty lines and PPP versions. Uses concurrent.futures to speed up the process. + """ + start_time = time.time() + + def get_country_data(povline, ppp_version, versions): + """ + This function is defined inside the main function because it needs to be called by concurrent.futures. + For country data. 
+ """ + return pip_query_country( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="all", + year="all", + fill_gaps=FILL_GAPS, + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="false", + ) + + def get_region_data(povline, ppp_version, versions): + """ + This function is defined inside the main function because it needs to be called by concurrent.futures. + For regional data. + """ + return pip_query_region( + wb_api, + popshare_or_povline="povline", + value=povline / 100, + versions=versions, + country_code="all", + year="all", + welfare_type="all", + reporting_level="all", + ppp_version=ppp_version, + download="false", + ) + + def concurrent_function(): + """ + This function makes concurrency work for country data. + """ + with ThreadPool(MAX_WORKERS) as pool: + tasks = [ + (povline, ppp_version, versions) + for ppp_version, povlines in POVLINES_DICT.items() + for povline in povlines + ] + results = pool.starmap(get_country_data, tasks) + + # Concatenate list of dataframes + results = pd.concat(results, ignore_index=True) + + return results + + def concurrent_region_function(): + """ + This function makes concurrency work for regional data. + """ + with ThreadPool(int(round(MAX_WORKERS / 2))) as pool: + tasks = [ + (povline, ppp_version, versions) + for ppp_version, povlines in POVLINES_DICT.items() + for povline in povlines + ] + results = pool.starmap(get_region_data, tasks) + + # Concatenate list of dataframes + results = pd.concat(results, ignore_index=True) + + return results + + # Obtain latest versions of the PIP dataset + versions = pip_versions(wb_api) + + # Run the main function + results = concurrent_function() + results_region = concurrent_region_function() + + # If country is nan but country_code is TWN, replace country with Taiwan, China + results.loc[results["country"].isnull() & (results["country_code"] == "TWN"), "country"] = "Taiwan, China" + + # I check if the set of countries is the same in the df and in the aux table (list of countries) + aux_dict = pip_aux_tables(wb_api, table="countries") + assert set(results["country"].unique()) == set(aux_dict["countries"]["country_name"].unique()), log.fatal( + f"List of countries is not the same! Differences: {set(results['country'].unique()) - set(aux_dict['countries']['country_name'].unique())}" + ) + + # I check if the set of regions is the same in the df and in the aux table (list of regions) + aux_dict = pip_aux_tables(wb_api, table="regions") + assert set(results_region["country"].unique()) == set(aux_dict["regions"]["region"].unique()), log.fatal( + f"List of regions is not the same! Differences: {set(results_region['country'].unique()) - set(aux_dict['regions']['region'].unique())}" + ) + + # Concatenate df_country and df_region + df = pd.concat([results, results_region], ignore_index=True) + + # Sort ppp_version, country, year and poverty_line + df = df.sort_values(by=["ppp_version", "country", "year", "poverty_line"]) # type: ignore + + # Save to csv + df.to_csv(f"{CACHE_DIR}/pip_raw.csv", index=False) + + end_time = time.time() + elapsed_time = round(end_time - start_time, 2) + log.info(f"Key indicators calculated. Execution time: {elapsed_time} seconds") + + return df + + +def median_patch(df, country_or_region): + """ + Patch missing values in the median column. + PIP queries do not return all the medians, so they are patched with the results of the percentile file. 
+ """ + + # Read percentile file + df_percentiles = pd.read_csv(f"{CACHE_DIR}/pip_percentiles.csv") + + # In df_percentiles, keep only the rows with percentile = 50 + df_percentiles = df_percentiles[df_percentiles["percentile"] == 50].reset_index() + + # If I want to patch the median for regions, I need to drop reporting_level and welfare_type columns + if country_or_region == "country": + # Merge df and df_percentiles + df = pd.merge( + df, + df_percentiles[["ppp_version", "country", "year", "reporting_level", "welfare_type", "thr"]], + on=["ppp_version", "country", "year", "reporting_level", "welfare_type"], + how="left", + ) + + # Replace missing values in median with thr + df["median"] = df["median"].fillna(df["thr"]) + + # Drop thr column + df = df.drop(columns=["thr"]) + + elif country_or_region == "region": + # Merge df and df_percentiles + df = pd.merge( + df, + df_percentiles[["ppp_version", "country", "year", "thr"]], + on=["ppp_version", "country", "year"], + how="left", + ) + + # Rename thr to median + df = df.rename(columns={"thr": "median"}) + + else: + raise ValueError("country_or_region must be 'country' or 'region'") + + log.info("Medians patched!") + + return df + + +def add_relative_poverty_and_decile_threholds(df, df_relative, df_percentiles): + """ + Add relative poverty indicators and decile thresholds to the key indicators file. + """ + + # Add relative poverty indicators + # They don't change with the PPP version, so we can use the 2017 version I estimated before. + df = pd.merge( + df, + df_relative[ + [ + "country", + "year", + "reporting_level", + "welfare_type", + "headcount_ratio_40_median", + "poverty_gap_index_40_median", + "poverty_severity_40_median", + "watts_40_median", + "headcount_ratio_50_median", + "poverty_gap_index_50_median", + "poverty_severity_50_median", + "watts_50_median", + "headcount_ratio_60_median", + "poverty_gap_index_60_median", + "poverty_severity_60_median", + "watts_60_median", + ] + ], + on=["country", "year", "reporting_level", "welfare_type"], + how="left", + ) + + # In df_percentiles, keep only the rows with percentile = 10, 20, 30, ... 90 + df_percentiles = df_percentiles[ + (df_percentiles["percentile"] % 10 == 0) & (df_percentiles["percentile"] != 100) + ].reset_index() + + # Make tb_percentile wide, with percentile as columns + df_percentiles = df_percentiles.pivot( + index=["ppp_version", "country", "year", "reporting_level", "welfare_type"], + columns="percentile", + values="thr", + ) + + # Rename columns + df_percentiles.columns = ["decile" + str(int(round(col / 10))) + "_thr" for col in df_percentiles.columns] + + # Reset index + df_percentiles = df_percentiles.reset_index() + + # Merge df and df_percentiles + df = pd.merge( + df, + df_percentiles, + on=["ppp_version", "country", "year", "reporting_level", "welfare_type"], + how="left", + ) + + # Save key indicators file + df.to_csv(f"{CACHE_DIR}/world_bank_pip.csv", index=False) + + log.info("Relative poverty indicators and decile thresholds added. 
Key indicators file done :)") + + return df + + +if __name__ == "__main__": + run() diff --git a/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc new file mode 100644 index 00000000000..4e5434ee522 --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip.csv.dvc @@ -0,0 +1,31 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: World Bank Poverty and Inequality Platform (PIP) + description: |- + The Poverty and Inequality Platform (PIP) is an interactive computational tool that offers users quick access to the World Bank’s estimates of poverty, inequality, and shared prosperity. PIP provides a comprehensive view of global, regional, and country-level trends for more than 160 economies around the world. + date_published: 2024-03-26 + version_producer: 20240326_2017, 20240326_2011 + title_snapshot: Key indicators + + # Citation + producer: World Bank Poverty and Inequality Platform + citation_full: |- + World Bank (2024). Poverty and Inequality Platform (version 20240326_2017 and 20240326_2011) [Data set]. World Bank Group. https://pip.worldbank.org/. Accessed March 27, 2024. + + # Files + url_main: https://pip.worldbank.org + date_accessed: 2024-03-27 + + # License + license: + name: CC0 + url: https://datacatalog.worldbank.org/search/dataset/0063646 + +wdir: ../../../data/snapshots/wb/2024-01-17 +outs: + - md5: 5fb032d2de430f79f25e1bdf1259c9bf + size: 35764784 + path: world_bank_pip.csv diff --git a/snapshots/wb/2024-03-27/world_bank_pip.py b/snapshots/wb/2024-03-27/world_bank_pip.py new file mode 100644 index 00000000000..c3db74c58bc --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip.py @@ -0,0 +1,36 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option( + "--upload/--skip-upload", + default=True, + type=bool, + help="Upload dataset to Snapshot", +) +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wb/{SNAPSHOT_VERSION}/world_bank_pip.csv") + + # Ensure destination folder exists. + snap.path.parent.mkdir(exist_ok=True, parents=True) + + # Copy local data file to snapshots data folder. + snap.path.write_bytes(Path(path_to_file).read_bytes()) + + # Add file to DVC and upload to S3. + snap.dvc_add(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc new file mode 100644 index 00000000000..d7c1982d021 --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.csv.dvc @@ -0,0 +1,33 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/origin/ +meta: + origin: + # Data product / Snapshot + title: 'World Bank Poverty and Inequality Platform (PIP): Percentiles' + description: |- + The Poverty and Inequality Platform: Percentiles database reports 100 points ranked according to the consumption or income distributions for country-year survey data available in the World Bank’s Poverty and Inequality Platform (PIP). 
There are, as of March 26, 2024, a total of 2,367 country-survey-year data points, which include 2,201 distributions based on microdata or binned data, and 166 based on grouped data. For the grouped data, the percentiles are derived by fitting a parametric Lorenz distribution following Datt (1998). For ease of communication, all distributions are referred to as survey data henceforth, and the welfare variable is referred to as income. + + We modified the original files available in [World Bank's Databank](https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles) to include distributions from missing countries and regions with data available in PIP's API. + date_published: 2024-04-08 + version_producer: Version 10 + + # Citation + producer: World Bank Poverty and Inequality Platform + citation_full: |- + - World Bank (2024). Poverty and Inequality Platform: percentiles [Data set]. World Bank Group. https://pip.worldbank.org/. Accessed 09 April 2024. + - World Bank (2024). Poverty and Inequality Platform (version 20240326_2017 and 20240326_2011) [Data set]. World Bank Group. https://pip.worldbank.org/. Accessed March 27, 2024. + + # Files + url_main: https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles + date_accessed: 2024-04-09 + + # License + license: + name: CC0 + url: https://datacatalog.worldbank.org/search/dataset/0063646/_poverty_and_inequality_platform_pip_percentiles + +wdir: ../../../data/snapshots/wb/2024-01-17 +outs: + - md5: f5bb53372a6fd0f563d20d04b3c897c7 + size: 49972432 + path: world_bank_pip_percentiles.csv diff --git a/snapshots/wb/2024-03-27/world_bank_pip_percentiles.py b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.py new file mode 100644 index 00000000000..17eb2bd88e3 --- /dev/null +++ b/snapshots/wb/2024-03-27/world_bank_pip_percentiles.py @@ -0,0 +1,25 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"wb/{SNAPSHOT_VERSION}/world_bank_pip_percentiles.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() From ce3db21db3ad8147f2492034ab089f34413d233f Mon Sep 17 00:00:00 2001 From: owidbot Date: Tue, 16 Apr 2024 04:01:59 +0000 Subject: [PATCH 25/40] :robot: automatic excess mortality update --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 11 ++++------- snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 8 ++++---- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index 1e9fbe61c30..d767fe6c0d4 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -5,16 +5,13 @@ meta: description: |- The dataset provides a weekly comprehensive overview of fire activity and its environmental impact, incorporating data from the Global Wildfire Information System (GWIS) and satellite imagery from MODIS and VIIRS. It includes metrics such as the area of land burnt, cumulative burnt areas, carbon dioxide emissions from fires, cumulative carbon emissions, the number of fires, and cumulative fire counts. title_snapshot: Seasonal wildfire trends (2024 and later) - description_snapshot: This dataset focuses specifically on older data. A separate snapshot will be created to add more recent data. + description_snapshot: This dataset focuses specifically on older data. A separate snapshot will be created to add more + recent data. citation_full: Global Wildfire Information System attribution_short: GWIS url_main: https://gwis.jrc.ec.europa.eu/apps/gwis.statistics/seasonaltrend - date_accessed: 2024-04-15 - date_published: 2024-04-15 + date_accessed: 2024-04-16 + date_published: 2024-04-16 license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license -outs: - - md5: 06757d4e2324d884c119b0a8c419e896 - size: 11650883 - path: weekly_wildfires.csv diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index e17ec668748..792d0249d31 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,8 +13,8 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-15 - publication_date: 2024-03-18 + date_accessed: 2024-04-16 + publication_date: 2024-04-15 publication_year: 2024 published_by: |- HMD. Human Mortality Database. Max Planck Institute for Demographic Research (Germany), University of California, Berkeley (USA), and French Institute for Demographic Studies (France). Available at www.mortality.org. 
@@ -33,6 +33,6 @@ meta: name: Creative Commons BY 4.0 url: https://www.mortality.org/Data/UserAgreement outs: - - md5: 486998231b386472ce10076bc0bb9267 - size: 20580398 + - md5: 862fd3a084100dc88101927fb51a216d + size: 20643551 path: hmd_stmf.csv diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index cc60e04075b..2d5a545076e 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-15 + date_accessed: 2024-04-16 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 3624b44c904..63f22023dde 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-15 + date_accessed: 2024-04-16 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index e5bf7ad7629..176b2188ee4 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-15 + date_accessed: 2024-04-16 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From affcd1303065a6f2eed52410d06cae8c452567dc Mon Sep 17 00:00:00 2001 From: owidbot Date: Tue, 16 Apr 2024 04:03:41 +0000 Subject: [PATCH 26/40] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index b688b891b6e..7b7183d377c 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. 
wdir: ../../../data/snapshots/who/latest outs: - - md5: 81d668993ca1dba5c2dd9feeb5b82218 - size: 150812561 + - md5: 09fc6d25ff6263883268a4af7ad5b43f + size: 150817870 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 059c1a79e02..221e5db9feb 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. wdir: ../../../data/snapshots/who/latest outs: - - md5: c330a107ff283f6862d1775b81b2a3bf - size: 25796571 + - md5: 1cd9f9e33c60e711bd3ce852d8ed9a95 + size: 25798090 path: flunet.csv From 99ef0d68f6720cfb6d308292a0681038411cc59f Mon Sep 17 00:00:00 2001 From: Marigold Date: Tue, 16 Apr 2024 08:05:03 +0200 Subject: [PATCH 27/40] :bug: save wildfires snapshot only after successful download --- snapshots/climate/latest/weekly_wildfires.csv.dvc | 4 ++++ snapshots/climate/latest/weekly_wildfires.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/snapshots/climate/latest/weekly_wildfires.csv.dvc b/snapshots/climate/latest/weekly_wildfires.csv.dvc index d767fe6c0d4..374bbe18307 100644 --- a/snapshots/climate/latest/weekly_wildfires.csv.dvc +++ b/snapshots/climate/latest/weekly_wildfires.csv.dvc @@ -15,3 +15,7 @@ meta: license: name: CC BY 4.0 url: https://gwis.jrc.ec.europa.eu/about-gwis/data-license +outs: + - md5: 06757d4e2324d884c119b0a8c419e896 + size: 11650883 + path: weekly_wildfires.csv diff --git a/snapshots/climate/latest/weekly_wildfires.py b/snapshots/climate/latest/weekly_wildfires.py index 4904b959a81..8755e613afa 100644 --- a/snapshots/climate/latest/weekly_wildfires.py +++ b/snapshots/climate/latest/weekly_wildfires.py @@ -50,9 +50,6 @@ def main(upload: bool) -> None: # Initialize a new snapshot object for storing data, using a predefined file path structure. snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/weekly_wildfires.csv") - # Add date_accessed - snap = modify_metadata(snap) - # Initialize an empty list to hold DataFrames for wildfire data. dfs_fires = [] @@ -170,6 +167,9 @@ def main(upload: bool) -> None: # Save the final DataFrame to the specified file path in the snapshot. df_to_file(df_final, file_path=snap.path) + # Add date_accessed + snap = modify_metadata(snap) + # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. snap.dvc_add(upload=upload) From ec135789fcc56d1b2b4929d48f13a3f791a516d7 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Tue, 16 Apr 2024 17:59:27 +0200 Subject: [PATCH 28/40] Fix concat issue in new pandas version (#2527) --- lib/catalog/owid/catalog/tables.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index db04e2e914a..150d44922cd 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -1215,6 +1215,26 @@ def concat( **kwargs, ) ) + ################################################################################################################ + # In pandas 2.2.1, pd.concat() does not return a copy when one of the input dataframes is empty. 
+ # This causes the following unexpected behavior: + # df_0 = pd.DataFrame({"a": ["original value"]}) + # df_1 = pd.concat([pd.DataFrame(), df_0], ignore_index=True) + # df_0.loc[:, "a"] = "new value" + # df_1["a"] # This will return "new value" instead of "original value". + # In pandas `1.4.0`, the behavior was as expected (returning "original value"). + # Note that this happens even if `copy=True` is passed to `pd.concat()`. + if any([len(obj) == 0 for obj in objs]): + if pd.__version__ != "2.2.1": + # Check if patch is no longer needed. + df_0 = pd.DataFrame({"a": ["original value"]}) + df_1 = pd.concat([pd.DataFrame(), df_0], ignore_index=True) + df_0.loc[:, "a"] = "new value" + if df_1["a"].item() != "new value": + log.warning("Remove patch in owid.catalog.tables.concat, which is no longer necessary.") + # Ensure concat returns a copy. + table = table.copy() + ################################################################################################################ if (axis == 1) or (axis == "columns"): # Original function pd.concat allows returning a dataframe with multiple columns with the same name. From 86d0712f748d56e17ddc5c1333955e8c8ae2fcd5 Mon Sep 17 00:00:00 2001 From: Fiona Spooner Date: Tue, 16 Apr 2024 17:55:29 +0100 Subject: [PATCH 29/40] =?UTF-8?q?=E2=9C=A8=20=20Adding=20WHO=20regions=20t?= =?UTF-8?q?o=20regions.yml=20(#2525)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * adding WHO regions * changing region type to who_regiona * changing who_region to aggregate * Fix bad region definition, and ensure is_historical is boolean in regions garden step * only keep owid defined regions * adding pablo's suggestions --------- Co-authored-by: Pablo Rosado Co-authored-by: Marigold --- .../2023-06-14/ai_national_strategy.py | 4 +- .../data/garden/regions/2023-01-01/regions.py | 6 + .../garden/regions/2023-01-01/regions.yml | 224 ++++++++++++++++++ 3 files changed, 233 insertions(+), 1 deletion(-) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index ad708f5e3cb..913118d3c60 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -21,7 +21,9 @@ def run(dest_dir: str) -> None: ds_meadow = cast(Dataset, paths.load_dependency("ai_national_strategy")) # Load region dataset to find all possible countries and later fill the ones that don't exist in the spreadsheet as not released (according to source that's the implication) ds_regions = cast(Dataset, paths.load_dependency("regions")) - countries_national_ai = pd.DataFrame(ds_regions["regions"]["name"]) + tb_regions = ds_regions["regions"] + tb_regions = tb_regions[tb_regions["defined_by"] == "owid"] + countries_national_ai = pd.DataFrame(tb_regions["name"]) countries_national_ai.reset_index(drop=True, inplace=True) countries_national_ai["released"] = np.NaN # Generate the column names from "2017" to "2022" diff --git a/etl/steps/data/garden/regions/2023-01-01/regions.py b/etl/steps/data/garden/regions/2023-01-01/regions.py index 1fb987a4b4a..086ea166032 100644 --- a/etl/steps/data/garden/regions/2023-01-01/regions.py +++ b/etl/steps/data/garden/regions/2023-01-01/regions.py @@ -53,6 +53,9 @@ def parse_raw_definitions(df: pd.DataFrame) -> pd.DataFrame: def run_sanity_checks(df: pd.DataFrame) -> None: + # Check that all regions have a 
name. + assert df[df["name"].isnull()].empty, f"Some regions do not have a name: {set(df[df['name'].isnull()]['code'])}" + # Check that there are no repeated codes. duplicated_codes = df[df["code"].duplicated()]["code"].tolist() assert len(duplicated_codes) == 0, f"Duplicated codes found: {duplicated_codes}" @@ -125,6 +128,9 @@ def run(dest_dir: str) -> None: lambda x: json.dumps(sum(list(x), [])) if pd.notna(x.values) else x ) + # Ensure "is_historical" is boolean. + tb_regions = tb_regions.astype({"is_historical": bool}) + # Set an appropriate index and sort conveniently. tb_regions = tb_regions.set_index("code", verify_integrity=True).sort_index() diff --git a/etl/steps/data/garden/regions/2023-01-01/regions.yml b/etl/steps/data/garden/regions/2023-01-01/regions.yml index 9c48b522abd..2239ada2813 100644 --- a/etl/steps/data/garden/regions/2023-01-01/regions.yml +++ b/etl/steps/data/garden/regions/2023-01-01/regions.yml @@ -1847,3 +1847,227 @@ end_year: 1902 successors: - "ZAF" + +# WHO regions +- code: WHO_AMR + name: "Americas (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "ATG" + - "ARG" + - "BHS" + - "BRB" + - "BOL" + - "BRA" + - "CAN" + - "CHL" + - "COL" + - "CRI" + - "CUB" + - "DMA" + - "DOM" + - "ECU" + - "SLV" + - "GRD" + - "GTM" + - "HTI" + - "HND" + - "JAM" + - "MEX" + - "NIC" + - "PAN" + - "PRY" + - "PER" + - "KNA" + - "LCA" + - "SUR" + - "TTO" + - "USA" + - "URY" + - "VEN" + +- code: WHO_AFR + name: "Africa (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "DZA" + - "AGO" + - "BEN" + - "BWA" + - "BFA" + - "BDI" + - "CMR" + - "CPV" + - "CAF" + - "TCD" + - "COM" + - "COG" + - "CIV" + - "COD" + - "GNQ" + - "ERI" + - "SWZ" + - "ETH" + - "GAB" + - "GMB" + - "GHA" + - "GIN" + - "GNB" + - "KEN" + - "LSO" + - "LBR" + - "MDG" + - "MWI" + - "MLI" + - "MRT" + - "MUS" + - "MOZ" + - "NAM" + - "NER" + - "NGA" + - "RWA" + - "STP" + - "SEN" + - "SYC" + - "SLE" + - "ZAF" + - "SSD" + - "TZA" + - "TGO" + - "UGA" + - "ZMB" + - "ZWE" +- code: WHO_EMR + name: "Eastern Mediterranean (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "AFG" + - "BHR" + - "DJI" + - "EGY" + - "IRN" + - "IRQ" + - "JOR" + - "KWT" + - "LBN" + - "LBY" + - "MAR" + - "OMN" + - "PAK" + - "QAT" + - "SAU" + - "SOM" + - "SDN" + - "SYR" + - "TUN" + - "ARE" + - "YEM" +- code: WHO_EUR + name: "Europe (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "ALB" + - "AND" + - "ARM" + - "AUT" + - "AZE" + - "BLR" + - "BEL" + - "BIH" + - "BGR" + - "HRV" + - "CYP" + - "CZE" + - "DNK" + - "EST" + - "FIN" + - "FRA" + - "GEO" + - "DEU" + - "GRC" + - "HUN" + - "ISL" + - "IRL" + - "ISR" + - "ITA" + - "KAZ" + - "KGZ" + - "LVA" + - "LTU" + - "LUX" + - "MLT" + - "MDA" + - "MCO" + - "MNE" + - "NLD" + - "MKD" + - "NOR" + - "POL" + - "PRT" + - "ROU" + - "RUS" + - "SMR" + - "SRB" + - "SVK" + - "SVN" + - "ESP" + - "SWE" + - "CHE" + - "TJK" + - "TUR" + - "TKM" + - "UKR" + - "GBR" + - "UZB" +- code: WHO_SEAR + name: "South-East Asia (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "BGD" + - "BTN" + - "PRK" + - "IND" + - "IDN" + - "MDV" + - "MMR" + - "NPL" + - "THA" + - "TLS" + - "LKA" +- code: WHO_WPAC + name: "Western Pacific (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "AUS" + - "BRN" + - "KHM" + - "CHN" + - "COK" + - "FJI" + - "JPN" + - "KIR" + - "LAO" + - "MYS" + - "MHL" + - "FSM" + - "MNG" + - "NRU" + - "NZL" + - "NIU" + - "PLW" + - "PNG" + - "PHL" + - "WSM" + - "SGP" + - "SLB" + - "KOR" + - "TON" + - "TUV" + - "VUT" + - "VNM" From 
01dba805763651b12d917ebfa5374ce72b57a994 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Tue, 16 Apr 2024 18:58:10 +0200 Subject: [PATCH 30/40] :sparkles: compute checksums from ingredients only (#2514) * :sparkles: compute checksums from ingredients only --- apps/owidbot/etldiff.py | 5 ++++- etl/steps/__init__.py | 10 +++------- .../garden/nasa/2023-03-06/ozone_hole_area.meta.yml | 3 --- .../war/2023-01-18/dunnigan_martel_1987.meta.yml | 1 - .../data/garden/war/2023-01-18/eckhardt_1991.meta.yml | 1 - .../data/garden/war/2023-01-18/kaye_1985.meta.yml | 1 - .../data/garden/war/2023-01-18/sutton_1971.meta.yml | 4 ---- 7 files changed, 7 insertions(+), 18 deletions(-) diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py index a721b43bd0f..504e53c48b6 100644 --- a/apps/owidbot/etldiff.py +++ b/apps/owidbot/etldiff.py @@ -55,6 +55,10 @@ def cli( nbranch = _normalise_branch(branch) if branch else "dry-run" + # TODO: only include site-screenshots if the PR is from owid-grapher. Similarly, don't + # run etl diff if the PR is from etl repo. + # - **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch} + body = f"""
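Aside on the subject line of this patch ("compute checksums from ingredients only"): the change shown further down in etl/steps/__init__.py makes a data step's output checksum depend only on the checksums of its inputs rather than on the generated dataset itself. Below is a minimal sketch of that idea only; the helper name `checksum_from_ingredients` and the md5-over-sorted-inputs scheme are assumptions for illustration, not the ETL codebase's actual implementation (the patch simply reuses `checksum_input()`).

```python
import hashlib
from typing import Iterable


def checksum_from_ingredients(input_checksums: Iterable[str]) -> str:
    """Derive a step's checksum purely from its input checksums (hypothetical helper)."""
    # Sort first so the result does not depend on dependency ordering.
    joined = ",".join(sorted(input_checksums))
    return hashlib.md5(joined.encode()).hexdigest()


# Same ingredients, in any order, give the same checksum; the produced dataset is never hashed.
assert checksum_from_ingredients(["abc123", "def456"]) == checksum_from_ingredients(["def456", "abc123"])
```

Because the checksum is a pure function of the inputs, local and remote builds of the same ingredients produce matching checksums, which a later patch in this series relies on when simplifying the owidbot diff output.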
@@ -63,7 +67,6 @@ def cli( - **Admin**: http://staging-site-{nbranch}/admin/login - **Site**: http://staging-site-{nbranch}/ - **Login**: `ssh owid@staging-site-{nbranch}` -- **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch}
diff --git a/etl/steps/__init__.py b/etl/steps/__init__.py index 652cd42ac0a..b47f18ea2e8 100644 --- a/etl/steps/__init__.py +++ b/etl/steps/__init__.py @@ -518,7 +518,8 @@ def _output_dataset(self) -> catalog.Dataset: return catalog.Dataset(self._dest_dir.as_posix()) def checksum_output(self) -> str: - return self._output_dataset.checksum() + # output checksum is checksum of all ingredients + return self.checksum_input() def _step_files(self) -> List[str]: "Return a list of code files defining this step." @@ -714,12 +715,7 @@ def has_existing_data(self) -> bool: return True def checksum_output(self) -> str: - # NOTE: we could use the checksum from `_dvc_path` to - # speed this up. Test the performance on - # time poetry run etl run garden --dry-run - # Make sure that the checksum below is the same as DVC checksum! It - # looks like it might be different for some reason - return files.checksum_file(self._dvc_path) + return Snapshot(self.path).m.outs[0]["md5"] @property def _dvc_path(self) -> str: diff --git a/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml b/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml index dc0290db97b..7ccf28e173a 100644 --- a/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml +++ b/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml @@ -16,9 +16,6 @@ dataset: Minimum and mean Southern Hemisphere daily ozone concentrations, measured in Dobson Units (DU). This dataset should be next updated by the source every year. We will update it on Our World in Data soon after the new version is published. At the link above you can directly access the source page and see the latest available data. - licenses: - - name: # TO BE FILLED. Example: Testing License Name - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml b/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml index 03e7ade1190..d4722538299 100644 --- a/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml @@ -17,7 +17,6 @@ dataset: This dataset provides information on military and civilian deaths from wars, drawn from the book by Dunnigan and Martel (1987). licenses: - name: Doubleday (1987) - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml b/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml index 790bc299ceb..7f77a80f561 100644 --- a/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml @@ -17,7 +17,6 @@ dataset: This dataset provides information on military and civilian deaths from wars, drawn from the chapter by Eckhardt (1991). licenses: - name: World Priorities - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml b/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml index 2f4f3db8fed..3ca1fb6a42a 100644 --- a/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml @@ -17,7 +17,6 @@ dataset: This dataset provides information on direct and indirect military and civilian deaths from major armed conflicts, drawn from the report by Kaye et al. (1985). 
licenses: - name: Department of National Defence, Canada, Operational Research and Analysis Establishment, 1985 - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml b/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml index 6a12a6987e2..540549a7a8a 100644 --- a/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml @@ -4,7 +4,6 @@ all_sources: published_by: Sutton, Antony. 1972. Wars and Revolutions in the Nineteenth Century. Hoover Institution Archives. url: https://searchworks.stanford.edu/view/3023823 date_accessed: 2023-01-09 - publication_date: # TO BE FILLED. Example: 2023-01-01 publication_year: 1971 # description: Source description. @@ -15,9 +14,6 @@ dataset: version: 2023-01-18 description: | This dataset provides information on deaths from wars and revolutions, using data from Sutton (1972). - licenses: - - name: Unknown - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing From 9059031496f2a19a4c28ebd6460201bd9db5af03 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Tue, 16 Apr 2024 23:16:24 +0200 Subject: [PATCH 31/40] :sparkles: add command sync.catalog for prefetching catalog from R2 to local (#2526) --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index 4df3585ed72..376ff4272f1 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,7 @@ help: @echo ' make format-all Format code (including modules in lib/)' @echo ' make full Fetch all data and run full transformations' @echo ' make grapher Publish supported datasets to Grapher' + @echo ' make sync.catalog Sync catalog from R2 into local data/ folder' @echo ' make lab Start a Jupyter Lab server' @echo ' make publish Publish the generated catalog to S3' @echo ' make api Start the ETL API on port 8081' @@ -118,6 +119,14 @@ prune: .venv @echo '==> Prune datasets with no recipe from catalog' poetry run etl d prune +# Syncing catalog is useful if you want to avoid rebuilding it locally from scratch +# which could take a few hours. This will download ~10gb from the main channels +# (meadow, garden, open_numbers) and is especially useful when we increase ETL_EPOCH +# or update regions. 
+sync.catalog: .venv + @echo '==> Sync catalog from R2 into local data/ folder (~10gb)' + rclone sync owid-r2:owid-catalog/ data/ --verbose --fast-list --transfers=64 --checkers=64 --include "/meadow/**" --include "/garden/**" --include "/open_numbers/**" + grapher: .venv @echo '==> Running full etl with grapher upsert' poetry run etl run --grapher From 3f964929cff3a4242272ea187bce714049399bd8 Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 17 Apr 2024 04:02:35 +0000 Subject: [PATCH 32/40] :robot: automatic excess mortality update --- snapshots/excess_mortality/latest/hmd_stmf.csv.dvc | 2 +- snapshots/excess_mortality/latest/wmd.csv.dvc | 2 +- snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc | 2 +- .../excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc index 792d0249d31..d09a7442446 100644 --- a/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc +++ b/snapshots/excess_mortality/latest/hmd_stmf.csv.dvc @@ -13,7 +13,7 @@ meta: HMD provides an online STMF visualization toolkit (https://mpidr.shinyapps.io/stmortality). url: https://www.mortality.org/Data/STMF source_data_url: https://www.mortality.org/File/GetDocument/Public/STMF/Outputs/stmf.csv - date_accessed: 2024-04-16 + date_accessed: 2024-04-17 publication_date: 2024-04-15 publication_year: 2024 published_by: |- diff --git a/snapshots/excess_mortality/latest/wmd.csv.dvc b/snapshots/excess_mortality/latest/wmd.csv.dvc index 2d5a545076e..65a2e1fe5e6 100644 --- a/snapshots/excess_mortality/latest/wmd.csv.dvc +++ b/snapshots/excess_mortality/latest/wmd.csv.dvc @@ -13,7 +13,7 @@ meta: Published paper available at https://elifesciences.org/articles/69336. url: https://github.com/akarlinsky/world_mortality/ source_data_url: https://raw.githubusercontent.com/akarlinsky/world_mortality/main/world_mortality.csv - date_accessed: 2024-04-16 + date_accessed: 2024-04-17 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc index 63f22023dde..07c24658aec 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak.csv.dvc @@ -7,7 +7,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-per-year.csv - date_accessed: 2024-04-16 + date_accessed: 2024-04-17 publication_date: '2021-06-30' publication_year: 2021 published_by: |- diff --git a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc index 176b2188ee4..2b4a5acb103 100644 --- a/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc +++ b/snapshots/excess_mortality/latest/xm_karlinsky_kobak_ages.csv.dvc @@ -6,7 +6,7 @@ meta: For more details, refer to https://github.com/dkobak/excess-mortality#excess-mortality-during-the-covid-19-pandemic. 
url: https://github.com/dkobak/excess-mortality source_data_url: https://raw.githubusercontent.com/dkobak/excess-mortality/main/baselines-stmf.csv - date_accessed: 2024-04-16 + date_accessed: 2024-04-17 publication_date: '2021-06-30' publication_year: 2021 published_by: |- From 43ae61173e73ad66030d911f7a5a65878a6eba98 Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 17 Apr 2024 04:04:06 +0000 Subject: [PATCH 33/40] :robot: automatic flunet update --- snapshots/who/latest/fluid.csv.dvc | 4 ++-- snapshots/who/latest/flunet.csv.dvc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/snapshots/who/latest/fluid.csv.dvc b/snapshots/who/latest/fluid.csv.dvc index 7b7183d377c..a2a7051f812 100644 --- a/snapshots/who/latest/fluid.csv.dvc +++ b/snapshots/who/latest/fluid.csv.dvc @@ -16,6 +16,6 @@ meta: The platform accommodates both qualitative and quantitative data which facilitates the tracking of global trends, spread, intensity, and impact of influenza. These data are made freely available to health policy makers in order to assist them in making informed decisions regarding the management of influenza. wdir: ../../../data/snapshots/who/latest outs: - - md5: 09fc6d25ff6263883268a4af7ad5b43f - size: 150817870 + - md5: e8ddbf642ef6d60c536db353c62147f4 + size: 150907130 path: fluid.csv diff --git a/snapshots/who/latest/flunet.csv.dvc b/snapshots/who/latest/flunet.csv.dvc index 221e5db9feb..1c40102dcf4 100644 --- a/snapshots/who/latest/flunet.csv.dvc +++ b/snapshots/who/latest/flunet.csv.dvc @@ -16,6 +16,6 @@ meta: The data are provided remotely by National Influenza Centres (NICs) of the Global Influenza Surveillance and Response System (GISRS) and other national influenza reference laboratories collaborating actively with GISRS, or are uploaded from WHO regional databases. wdir: ../../../data/snapshots/who/latest outs: - - md5: 1cd9f9e33c60e711bd3ce852d8ed9a95 - size: 25798090 + - md5: 0221c18fd18fdfe7b4510a3fea61459f + size: 25800809 path: flunet.csv From 092a1778feb5d666e75c58635da044ef4584f944 Mon Sep 17 00:00:00 2001 From: Fiona Spooner Date: Wed, 17 Apr 2024 08:12:43 +0100 Subject: [PATCH 34/40] Update who.meta.yml (#2529) --- etl/steps/data/garden/wash/2024-01-06/who.meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/steps/data/garden/wash/2024-01-06/who.meta.yml b/etl/steps/data/garden/wash/2024-01-06/who.meta.yml index 8feb8a86f07..4e5167f58b9 100644 --- a/etl/steps/data/garden/wash/2024-01-06/who.meta.yml +++ b/etl/steps/data/garden/wash/2024-01-06/who.meta.yml @@ -25,7 +25,7 @@ definitions: safely_managed_sanitation_desc: &safely_managed_sanitation_desc | Safely managed sanitation services are defined as improved sanitation facilities that are not shared with other households and where excreta are safely disposed in situ or transported and treated off-site. basic_drinking_water_desc: &basic_drinking_water_desc | - Basic drinking water services are defined as an improved drinking water source,provided collection time is not more than 30 minutes for a roundtrip including queuing. + Basic drinking water services are defined as an improved drinking water source, provided collection time is not more than 30 minutes for a roundtrip including queuing. limited_drinking_water_desc: &limited_drinking_water_desc | Limited drinking water services are defined as drinking water from an improved source for which collection time exceeds 30 minutes for a roundtrip including queuing. 
improved_drinking_water_desc: &improved_drinking_water_desc | From 2e72a3ab35a51f7f4b8eb4a8090df77a3ce32235 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Wed, 17 Apr 2024 13:36:19 +0200 Subject: [PATCH 35/40] :bug: reuse engine when deleting ghost variables (#2530) * :bug: reuse engine when deleting ghost variables * :bug: fix creating missing entities * :hammer: deprecate DBUtils * :bug: fix container name --- apps/backport/datasync/data_metadata.py | 8 +- apps/owidbot/etldiff.py | 20 ++-- etl/chart_revision/v1/deprecated.py | 50 ++++----- etl/chart_revision/v1/revision.py | 30 ++--- etl/db.py | 29 ----- etl/db_utils.py | 142 ------------------------ etl/grapher_helpers.py | 52 +++++---- etl/grapher_import.py | 41 ++++--- etl/steps/__init__.py | 7 +- 9 files changed, 116 insertions(+), 263 deletions(-) delete mode 100644 etl/db_utils.py diff --git a/apps/backport/datasync/data_metadata.py b/apps/backport/datasync/data_metadata.py index 190a6d32a85..f1d4fe3e725 100644 --- a/apps/backport/datasync/data_metadata.py +++ b/apps/backport/datasync/data_metadata.py @@ -83,7 +83,13 @@ def add_entity_code_and_name(session: Session, df: pd.DataFrame) -> pd.DataFrame df["entityCode"] = [] return df - entities = _fetch_entities(session, list(df["entityId"].unique())) + unique_entities = df["entityId"].unique() + + entities = _fetch_entities(session, list(unique_entities)) + + if set(unique_entities) - set(entities.entityId): + missing_entities = set(unique_entities) - set(entities.entityId) + raise ValueError(f"Missing entities in the database: {missing_entities}") return pd.merge(df, entities, on="entityId") diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py index 504e53c48b6..266bcd816a6 100644 --- a/apps/owidbot/etldiff.py +++ b/apps/owidbot/etldiff.py @@ -1,6 +1,6 @@ import datetime as dt -import re import subprocess +import time from typing import Tuple import click @@ -10,7 +10,7 @@ from rich.ansi import AnsiDecoder from rich_click.rich_command import RichCommand -from apps.staging_sync.cli import _normalise_branch +from apps.staging_sync.cli import _get_container_name from etl import config from etl.paths import BASE_DIR @@ -50,10 +50,12 @@ def cli( $ python apps/owidbot/etldiff.py --branch my-branch ``` """ + t = time.time() + lines = call_etl_diff(include) diff, result = format_etl_diff(lines) - nbranch = _normalise_branch(branch) if branch else "dry-run" + container_name = _get_container_name(branch) if branch else "dry-run" # TODO: only include site-screenshots if the PR is from owid-grapher. Similarly, don't # run etl diff if the PR is from etl repo. @@ -64,9 +66,9 @@ def cli( Staging server: -- **Admin**: http://staging-site-{nbranch}/admin/login -- **Site**: http://staging-site-{nbranch}/ -- **Login**: `ssh owid@staging-site-{nbranch}` +- **Admin**: http://{container_name}/admin/login +- **Site**: http://{container_name}/ +- **Login**: `ssh owid@{container_name}`
@@ -81,6 +83,7 @@ def cli(
_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_ +_Execution time: {time.time() - t:.2f} seconds_ """.strip() if dry_run: @@ -142,6 +145,7 @@ def format_etl_diff(lines: list[str]) -> Tuple[str, str]: diff = "\n".join(new_lines) + # NOTE: we don't need this anymore, we now have consistent checksums on local and remote # Some datasets might have different checksum, but be the same (this is caused by checksum_input and checksum_output # problem). Hotfix this by removing matching datasets from the output. # Example: @@ -152,8 +156,8 @@ def format_etl_diff(lines: list[str]) -> Tuple[str, str]: # ~ Column A # = Dataset grapher/agriculture/2024-03-26/attainable_yields # = Table attainable_yields - pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)" - diff = re.sub(pattern, "", diff) + # pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. Dataset|\n)" + # diff = re.sub(pattern, "", diff) return diff, result diff --git a/etl/chart_revision/v1/deprecated.py b/etl/chart_revision/v1/deprecated.py index fc47931c239..f4159b7f427 100644 --- a/etl/chart_revision/v1/deprecated.py +++ b/etl/chart_revision/v1/deprecated.py @@ -21,7 +21,7 @@ from tqdm import tqdm from etl.config import DEBUG, GRAPHER_USER_ID -from etl.db import open_db +from etl.db import get_engine from etl.grapher_helpers import IntRange log = structlog.get_logger() @@ -179,23 +179,23 @@ def _get_chart_update_reason(self, variable_ids: List[int]) -> str: Accesses DB and finds out the name of the recently added dataset with the new variables.""" try: - with open_db() as db: + with get_engine().connect() as con: if len(variable_ids) == 1: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN ({variable_ids[0]}) """ - ) + ).fetchmany() else: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN {*variable_ids,} """ - ) + ).fetchmany() except Exception: self.report_error( "Problem found when accessing the DB trying to get details on the newly added variables" @@ -220,10 +220,10 @@ def _get_chart_update_reason(self, variable_ids: List[int]) -> str: def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: n_before = 0 try: - with open_db() as db: - n_before = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_before = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore - res = db.fetch_many( + res = con.execute( """ SELECT * FROM ( @@ -235,7 +235,7 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: ) as grouped WHERE grouped.c > 1 """ - ) + ).fetchmany() if len(res): raise RuntimeError( "Two or more suggested chart revisions with status IN " @@ -267,13 +267,13 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: VALUES (%s, %s, %s, %s, %s, %s, NOW(), NOW()) """ - db.upsert_many(query, tuples) + con.execute(query, tuples) # checks if any of the affected chartIds now has multiple # pending suggested revisions. If so, then rejects the whole # insert and tell the user which suggested chart revisions need # to be approved/rejected. 
- res = db.fetch_many( + res = con.execute( f""" SELECT id, scr.chartId, c, createdAt FROM ( @@ -291,7 +291,7 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: WHERE grouped.c > 1 ORDER BY createdAt ASC """ - ) + ).fetchmany() if len(res): df = pd.DataFrame(res, columns=["id", "chart_id", "count", "created_at"]) df["drop"] = df.groupby("chart_id")["created_at"].transform(lambda gp: gp == gp.max()) @@ -321,8 +321,8 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: self.report_error(f"INSERT operation into `suggested_chart_revisions` cancelled. Error: {e}") raise e finally: - with open_db() as db: - n_after = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_after = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore self.report_info( f"{n_after - n_before} of {len(suggested_chart_revisions)} suggested chart revisions inserted." @@ -343,18 +343,18 @@ def _get_charts_from_old_variables( df_chart_dimensions: dataframe of chart_dimensions rows. df_chart_revisions: dataframe of chart_revisions rows. """ - with open_db() as db: + with get_engine().connect() as con: # retrieves chart_dimensions variable_ids = list(self.old_var_id2new_var_id.keys()) variable_ids_str = ",".join([str(_id) for _id in variable_ids]) columns = ["id", "chartId", "variableId", "property", "order"] - rows = db.fetch_many( + rows = con.execute( f""" SELECT {','.join([f'`{col}`' for col in columns])} FROM chart_dimensions WHERE variableId IN ({variable_ids_str}) """ - ) + ).fetchmany() df_chart_dimensions = pd.DataFrame(rows, columns=columns) # retrieves charts @@ -369,40 +369,40 @@ def _get_charts_from_old_variables( "lastEditedAt", "publishedAt", ] - rows = db.fetch_many( + rows = con.execute( f""" SELECT {','.join(columns)} FROM charts WHERE id IN ({chart_ids_str}) """ - ) + ).fetchmany() df_charts = pd.DataFrame(rows, columns=columns) # retrieves chart_revisions columns = ["id", "chartId", "userId", "config", "createdAt", "updatedAt"] - rows = db.fetch_many( + rows = con.execute( f""" SELECT {','.join(columns)} FROM chart_revisions WHERE chartId IN ({chart_ids_str}) """ - ) + ).fetchmany() df_chart_revisions = pd.DataFrame(rows, columns=columns) return df_charts, df_chart_dimensions, df_chart_revisions def _get_variable_year_ranges(self) -> Dict[int, List[int]]: - with open_db() as db: + with get_engine().connect() as con: all_var_ids = list(self.old_var_id2new_var_id.keys()) + list(self.old_var_id2new_var_id.values()) variable_ids_str = ",".join([str(_id) for _id in all_var_ids]) raise NotImplementedError("data_values was deprecated") - rows = db.fetch_many( + rows = con.execute( f""" SELECT variableId, MIN(year) AS minYear, MAX(year) AS maxYear FROM data_values WHERE variableId IN ({variable_ids_str}) GROUP BY variableId """ - ) + ).fetchmany() var_id2year_range = {} for variable_id, min_year, max_year in rows: var_id2year_range[variable_id] = [min_year, max_year] diff --git a/etl/chart_revision/v1/revision.py b/etl/chart_revision/v1/revision.py index 1cbe360c409..0346de1b9ee 100644 --- a/etl/chart_revision/v1/revision.py +++ b/etl/chart_revision/v1/revision.py @@ -15,7 +15,7 @@ from etl.chart_revision.v1.chart import Chart from etl.chart_revision.v1.variables import VariablesUpdate from etl.config import GRAPHER_USER_ID -from etl.db import get_engine, open_db +from etl.db import get_engine log = get_logger() # The maximum length of the suggested revision reason 
can't exceed the maximum length specified by the datatype "suggestedReason" in grapher.suggested_chart_revisions table. @@ -341,10 +341,10 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): """Submit chart revisions to Grapher.""" n_before = 0 try: - with open_db() as db: - n_before = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_before = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore - res = db.fetch_many( + res = con.execute( """ SELECT * FROM ( @@ -356,7 +356,7 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): ) as grouped WHERE grouped.c > 1 """ - ) + ).fetchmany() if len(res): raise RuntimeError( "Two or more suggested chart revisions with status IN " @@ -387,13 +387,13 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): VALUES (%s, %s, %s, %s, %s, %s, %s, NOW(), NOW()) """ - db.upsert_many(query, tuples) + con.execute(query, tuples) # checks if any of the affected chartIds now has multiple # pending suggested revisions. If so, then rejects the whole # insert and tell the user which suggested chart revisions need # to be approved/rejected. - res = db.fetch_many( + res = con.execute( f""" SELECT id, scr.chartId, c, createdAt FROM ( @@ -411,7 +411,7 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): WHERE grouped.c > 1 ORDER BY createdAt ASC """ - ) + ).fetchmany() if len(res): df = pd.DataFrame(res, columns=["id", "chart_id", "count", "created_at"]) df["drop"] = df.groupby("chart_id")["created_at"].transform(lambda gp: gp == gp.max()) @@ -441,8 +441,8 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): log.info(f"INSERT operation into `suggested_chart_revisions` cancelled. 
Error: {e}") raise e finally: - with open_db() as db: - n_after = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_after = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore log.info(f"{n_after - n_before} of {len(revisions)} suggested chart revisions inserted.") @@ -452,23 +452,23 @@ def _get_chart_update_reason(variable_ids: List[int]) -> str: Accesses DB and finds out the name of the recently added dataset with the new variables.""" try: - with open_db() as db: + with get_engine().connect() as con: if len(variable_ids) == 1: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN ({variable_ids[0]}) """ - ) + ).fetchmany() else: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN {*variable_ids,} """ - ) + ).fetchmany() except Exception: log.error( "Problem found when accessing the DB trying to get details on the newly added variables" diff --git a/etl/db.py b/etl/db.py index dcd13ba0e3c..cf07e1edc05 100644 --- a/etl/db.py +++ b/etl/db.py @@ -1,7 +1,4 @@ -import traceback import warnings -from collections.abc import Generator -from contextlib import contextmanager from typing import Any, Dict, List, Optional, cast from urllib.parse import quote @@ -14,7 +11,6 @@ from sqlmodel import Session from etl import config -from etl.db_utils import DBUtils log = structlog.get_logger() @@ -59,31 +55,6 @@ def get_engine(conf: Optional[Dict[str, Any]] = None) -> Engine: ) -@contextmanager -def open_db() -> Generator[DBUtils, None, None]: - connection = None - cursor = None - try: - connection = get_connection() - connection.autocommit(False) - cursor = connection.cursor() - yield DBUtils(cursor) - connection.commit() - except Exception as e: - log.error(f"Error encountered during import: {e}") - log.error("Rolling back changes...") - if connection: - connection.rollback() - if config.DEBUG: - traceback.print_exc() - raise e - finally: - if cursor: - cursor.close() - if connection: - connection.close() - - def get_dataset_id( dataset_name: str, db_conn: Optional[MySQLdb.Connection] = None, version: Optional[str] = None ) -> Any: diff --git a/etl/db_utils.py b/etl/db_utils.py deleted file mode 100644 index 35213052f3b..00000000000 --- a/etl/db_utils.py +++ /dev/null @@ -1,142 +0,0 @@ -"""This module was inspired by https://github.com/owid/importers/blob/master/db_utils.py. 
It is not meant -to be extended, but slowly replaced by etl/grapher_model.py""" - -from typing import Any, Dict, Iterable, List, Optional, Tuple, cast - -import structlog -from MySQLdb import IntegrityError -from MySQLdb.cursors import Cursor -from unidecode import unidecode - -log = structlog.get_logger() - -UNMODIFIED = 0 -INSERT = 1 -UPDATE = 2 - - -def normalize_entity_name(entity_name: str) -> str: - return unidecode(entity_name.strip()) - - -class NotOne(ValueError): - pass - - -class DBUtils: - def __init__(self, cursor: Cursor): - self.cursor = cursor - self.entity_id_by_normalised_name: Dict[str, int] = {} - - def get_entity_cache(self) -> Dict[str, int]: - return self.entity_id_by_normalised_name - - def fetch_one_or_none(self, *args: Any, **kwargs: Any) -> Any: - self.cursor.execute(*args, **kwargs) - rows = self.cursor.fetchall() - if len(rows) > 1: - raise NotOne("Expected 1 or 0 rows but received %d" % (len(rows))) - elif len(rows) == 1: - return rows[0] - else: - return None - - def fetch_one(self, *args: Any, **kwargs: Any) -> Any: - result = self.fetch_one_or_none(*args, **kwargs) - if result is None: - raise NotOne("Expected 1 row but received 0") - else: - return result - - def fetch_many(self, *args: Any, **kwargs: Any) -> List[Any]: - self.cursor.execute(*args, **kwargs) - return cast(List[Any], self.cursor.fetchall()) - - def insert_one(self, *args: Any, **kwargs: Any) -> int: - self.cursor.execute(*args, **kwargs) - return int(self.cursor.lastrowid) - - def upsert_one(self, *args: Any, **kwargs: Any) -> Optional[int]: - self.cursor.execute(*args, **kwargs) - if self.cursor.rowcount == 0: - return UNMODIFIED - if self.cursor.rowcount == 1: - return INSERT - if self.cursor.rowcount == 2: - return UPDATE - return None - - def upsert_many(self, query: str, tuples: Iterable[Tuple[Any, ...]]) -> None: - self.cursor.executemany(query, list(tuples)) - - def execute_until_empty(self, *args: Any, **kwargs: Any) -> None: - first = True - while first or self.cursor.rowcount > 0: - first = False - self.cursor.execute(*args, **kwargs) - - def __get_cached_entity_id(self, name: str) -> Optional[int]: - normalised_name = normalize_entity_name(name) - if normalised_name in self.entity_id_by_normalised_name: - return self.entity_id_by_normalised_name[normalised_name] - else: - return None - - def get_or_create_entity(self, name: str) -> int: - # Serve from cache if available - entity_id = self.__get_cached_entity_id(name) - if entity_id is not None: - return entity_id - # Populate cache from database - self.prefill_entity_cache([name]) - entity_id = self.__get_cached_entity_id(name) - if entity_id is not None: - return entity_id - # If still not in cache, it's a new entity and we have to insert it - else: - try: - self.upsert_one( - """ - INSERT INTO entities - (name, displayName, validated, createdAt, updatedAt) - VALUES - (%s, '', FALSE, NOW(), NOW()) - """, - [name], - ) - except IntegrityError: - # If another process inserted the same entity before us, we can - # safely ignore the error and fetch the ID - pass - - (entity_id,) = self.fetch_one( - """ - SELECT id FROM entities - WHERE name = %s - """, - [name], - ) - # Cache the newly created entity - self.entity_id_by_normalised_name[normalize_entity_name(name)] = entity_id - return cast(int, entity_id) - - def prefill_entity_cache(self, names: List[str]) -> None: - rows = self.fetch_many( - """ - SELECT - name, - id - FROM entities - WHERE - entities.name IN %(country_names)s - ORDER BY entities.id ASC - """, - {"country_names": 
[normalize_entity_name(x) for x in names]}, - ) - # Merge the two dicts - self.entity_id_by_normalised_name.update( - { - # entityName → entityId - **dict((row[0], row[1]) for row in rows if row[1]), - } - ) diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py index ced1c8bf3f1..04c6467cf47 100644 --- a/etl/grapher_helpers.py +++ b/etl/grapher_helpers.py @@ -1,5 +1,4 @@ import copy -import warnings from copy import deepcopy from dataclasses import dataclass, field, is_dataclass from pathlib import Path @@ -10,12 +9,12 @@ import pandas as pd import structlog from jinja2 import Environment +from MySQLdb import IntegrityError from owid import catalog from owid.catalog.utils import underscore from sqlalchemy.engine import Engine from etl.db import get_engine, read_sql -from etl.db_utils import DBUtils from etl.files import checksum_str log = structlog.get_logger() @@ -313,16 +312,41 @@ def _get_entities_from_db( def _get_and_create_entities_in_db(countries: Set[str], engine: Engine | None = None) -> Dict[str, int]: engine = engine or get_engine() with engine.connect() as con: - cursor = con.connection.cursor() - db = DBUtils(cursor) log.info("Creating entities in DB", countries=countries) - return {name: db.get_or_create_entity(name) for name in countries} + out = {} + for name in countries: + try: + con.execute( + """ + INSERT INTO entities + (name, displayName, validated, createdAt, updatedAt) + VALUES + (%(name)s, '', FALSE, NOW(), NOW()) + """, + {"name": name}, + ) + except IntegrityError: + # If another process inserted the same entity before us, we can + # safely ignore the error and fetch the ID + pass + + row = con.execute( + """ + SELECT id FROM entities + WHERE name = %(name)s + """, + {"name": name}, + ).fetchone() + assert row + + out[name] = row[0] + + return out def country_to_entity_id( country: pd.Series, create_entities: bool = False, - errors: Literal["raise", "ignore", "warn"] = "raise", by: Literal["name", "code"] = "name", engine: Engine | None = None, ) -> pd.Series: @@ -347,19 +371,9 @@ def country_to_entity_id( # cast to float to fix issues with categories entity_id[ix] = country[ix].map(_get_and_create_entities_in_db(set(country[ix]), engine=engine)).astype(float) - if entity_id.isnull().any(): - msg = f"Some countries have not been mapped: {set(country[entity_id.isnull()])}" - if errors == "raise": - raise ValueError(msg) - elif errors == "warn": - warnings.warn(msg) - elif errors == "ignore": - pass - - # Int64 allows NaN values - return cast(pd.Series, entity_id.astype("Int64")) - else: - return cast(pd.Series, entity_id.astype(int)) + assert not entity_id.isnull().any(), f"Some countries have not been mapped: {set(country[entity_id.isnull()])}" + + return cast(pd.Series, entity_id.astype(int)) def _unique(x: List[Any]) -> List[Any]: diff --git a/etl/grapher_import.py b/etl/grapher_import.py index 346b42a5043..fe4c4fa82a1 100644 --- a/etl/grapher_import.py +++ b/etl/grapher_import.py @@ -30,7 +30,6 @@ ) from apps.backport.datasync.datasync import upload_gzip_dict from etl import config -from etl.db import open_db from . import grapher_helpers as gh from . 
import grapher_model as gm @@ -359,7 +358,7 @@ def set_dataset_checksum_and_editedAt(dataset_id: int, checksum: str) -> None: session.commit() -def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) -> None: +def cleanup_ghost_variables(engine: Engine, dataset_id: int, upserted_variable_ids: List[int]) -> None: """Remove all leftover variables that didn't get upserted into DB during grapher step. This could happen when you rename or delete a variable in ETL. Raise an error if we try to delete variable used by any chart. @@ -368,15 +367,14 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - :param upserted_variable_ids: variables upserted in grapher step :param workers: delete variables in parallel """ - with open_db() as db: + with engine.connect() as con: # get all those variables first - db.cursor.execute( + rows = con.execute( """ SELECT id FROM variables WHERE datasetId=%(dataset_id)s AND id NOT IN %(variable_ids)s """, {"dataset_id": dataset_id, "variable_ids": upserted_variable_ids or [-1]}, - ) - rows = db.cursor.fetchall() + ).fetchall() variable_ids_to_delete = [row[0] for row in rows] @@ -387,19 +385,18 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - log.info("cleanup_ghost_variables.start", size=len(variable_ids_to_delete)) # raise an exception if they're used in any charts - db.cursor.execute( + rows = con.execute( """ SELECT chartId, variableId FROM chart_dimensions WHERE variableId IN %(variable_ids)s """, {"dataset_id": dataset_id, "variable_ids": variable_ids_to_delete}, - ) - rows = db.cursor.fetchall() + ).fetchall() if rows: rows = pd.DataFrame(rows, columns=["chartId", "variableId"]) raise ValueError(f"Variables used in charts will not be deleted automatically:\n{rows}") # then variables themselves with related data in other tables - db.cursor.execute( + con.execute( """ DELETE FROM country_latest_data WHERE variable_id IN %(variable_ids)s """, @@ -407,19 +404,19 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - ) # delete relationships - db.cursor.execute( + con.execute( """ DELETE FROM origins_variables WHERE variableId IN %(variable_ids)s """, {"variable_ids": variable_ids_to_delete}, ) - db.cursor.execute( + con.execute( """ DELETE FROM tags_variables_topic_tags WHERE variableId IN %(variable_ids)s """, {"variable_ids": variable_ids_to_delete}, ) - db.cursor.execute( + con.execute( """ DELETE FROM posts_gdocs_variables_faqs WHERE variableId IN %(variable_ids)s """, @@ -427,7 +424,7 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - ) # delete them from explorers - db.cursor.execute( + con.execute( """ DELETE FROM explorer_variables WHERE variableId IN %(variable_ids)s """, @@ -435,7 +432,7 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - ) # finally delete variables - db.cursor.execute( + result = con.execute( """ DELETE FROM variables WHERE datasetId=%(dataset_id)s AND id IN %(variable_ids)s """, @@ -444,34 +441,34 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - log.warning( "cleanup_ghost_variables.end", - size=db.cursor.rowcount, + size=result.rowcount, variables=variable_ids_to_delete, ) -def cleanup_ghost_sources(dataset_id: int, upserted_source_ids: List[int]) -> None: +def cleanup_ghost_sources(engine: Engine, dataset_id: int, upserted_source_ids: List[int]) -> None: """Remove all leftover sources that didn't get upserted into DB 
during grapher step. This could happen when you rename or delete sources. :param dataset_id: ID of the dataset :param upserted_source_ids: sources upserted in grapher step """ - with open_db() as db: + with engine.connect() as con: if upserted_source_ids: - db.cursor.execute( + result = con.execute( """ DELETE FROM sources WHERE datasetId=%(dataset_id)s AND id NOT IN %(source_ids)s """, {"dataset_id": dataset_id, "source_ids": upserted_source_ids}, ) else: - db.cursor.execute( + result = con.execute( """ DELETE FROM sources WHERE datasetId=%(dataset_id)s """, {"dataset_id": dataset_id}, ) - if db.cursor.rowcount > 0: - log.warning(f"Deleted {db.cursor.rowcount} ghost sources") + if result.rowcount > 0: + log.warning(f"Deleted {result.rowcount} ghost sources") def _get_entity_name(session: Session, entity_id: int) -> str: diff --git a/etl/steps/__init__.py b/etl/steps/__init__.py index b47f18ea2e8..af9ccda8d60 100644 --- a/etl/steps/__init__.py +++ b/etl/steps/__init__.py @@ -29,6 +29,7 @@ from owid.walden import CATALOG as WALDEN_CATALOG from owid.walden import Catalog as WaldenCatalog from owid.walden import Dataset as WaldenDataset +from sqlalchemy.engine import Engine from etl import config, files, git, paths from etl import grapher_helpers as gh @@ -853,7 +854,7 @@ def run(self) -> None: variable_upsert_results = [future.result() for future in as_completed(futures)] if not config.GRAPHER_FILTER and not config.SUBSET: - self._cleanup_ghost_resources(dataset_upsert_results, variable_upsert_results) + self._cleanup_ghost_resources(engine, dataset_upsert_results, variable_upsert_results) # set checksum and updatedAt timestamps after all data got inserted gi.set_dataset_checksum_and_editedAt(dataset_upsert_results.dataset_id, self.data_step.checksum_input()) @@ -864,6 +865,7 @@ def checksum_output(self) -> str: @classmethod def _cleanup_ghost_resources( cls, + engine: Engine, dataset_upsert_results, variable_upsert_results: List[Any], ) -> None: @@ -882,10 +884,11 @@ def _cleanup_ghost_resources( # Try to cleanup ghost variables, but make sure to raise an error if they are used # in any chart gi.cleanup_ghost_variables( + engine, dataset_upsert_results.dataset_id, upserted_variable_ids, ) - gi.cleanup_ghost_sources(dataset_upsert_results.dataset_id, upserted_source_ids) + gi.cleanup_ghost_sources(engine, dataset_upsert_results.dataset_id, upserted_source_ids) # TODO: cleanup origins that are not used by any variable From 3c918a0bf710e58637c7d7ef104bf9721822f4fe Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 17 Apr 2024 11:44:08 +0000 Subject: [PATCH 36/40] fasttrack: fasttrack/2024-04-17/qubits.csv --- dag/fasttrack.yml | 2 ++ .../fasttrack/2024-04-17/qubits.meta.yml | 16 +++++++++++++ .../grapher/fasttrack/2024-04-17/qubits.py | 22 +++++++++++++++++ snapshots/fasttrack/2024-04-17/qubits.csv.dvc | 24 +++++++++++++++++++ 4 files changed, 64 insertions(+) create mode 100644 etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml create mode 100644 etl/steps/data/grapher/fasttrack/2024-04-17/qubits.py create mode 100644 snapshots/fasttrack/2024-04-17/qubits.csv.dvc diff --git a/dag/fasttrack.yml b/dag/fasttrack.yml index 8912820bea8..3637725ec67 100644 --- a/dag/fasttrack.yml +++ b/dag/fasttrack.yml @@ -158,3 +158,5 @@ steps: - snapshot://fasttrack/latest/gpei.csv data-private://grapher/fasttrack/latest/conflict_deaths_combined: - snapshot-private://fasttrack/latest/conflict_deaths_combined.csv + data://grapher/fasttrack/2024-04-17/qubits: + - snapshot://fasttrack/2024-04-17/qubits.csv 
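The rewrite of etl/grapher_helpers.py above replaces the old DBUtils entity cache with a direct insert-then-select: try to INSERT the entity, swallow the duplicate-key error if a concurrent process won the race, and read the id back. Below is a minimal, self-contained sketch of that get-or-create idiom. It assumes only what the diff shows plus a unique key on entities.name (an assumption), and it uses sqlalchemy.text() bindings with short per-row transactions, so it illustrates the pattern rather than reproducing the ETL code. Note that statements executed through a SQLAlchemy connection surface sqlalchemy.exc.IntegrityError (wrapping the driver's error), which is what the sketch catches.

from typing import Dict, Iterable

from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import IntegrityError


def get_or_create_entities(engine: Engine, names: Iterable[str]) -> Dict[str, int]:
    """Return a name -> id mapping, inserting any entity that does not exist yet."""
    ids: Dict[str, int] = {}
    for name in names:
        try:
            # Each insert runs in its own short transaction; engine.begin() commits on
            # success and rolls back if the statement fails.
            with engine.begin() as con:
                con.execute(
                    text(
                        "INSERT INTO entities (name, displayName, validated, createdAt, updatedAt) "
                        "VALUES (:name, '', FALSE, NOW(), NOW())"
                    ),
                    {"name": name},
                )
        except IntegrityError:
            # Assumed unique key on `name`: a concurrent writer created the same entity
            # first, which is fine -- the SELECT below picks up its id.
            pass
        with engine.connect() as con:
            row = con.execute(text("SELECT id FROM entities WHERE name = :name"), {"name": name}).fetchone()
        assert row is not None, f"entity {name!r} should exist after the insert"
        ids[name] = int(row[0])
    return ids

A caller would build the engine however the project normally does (for example via etl.db.get_engine(), as imported in the diff above) and pass it in.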
diff --git a/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml new file mode 100644 index 00000000000..dfd2fcf0e76 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml @@ -0,0 +1,16 @@ +dataset: + title: Quantum processors over time + description: '' + licenses: + - name: Creative Commons Attribution-ShareAlike 3.0 + url: https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License +tables: + qubits: + variables: + qubits: + title: Record number of quantum bits per processor + unit: qubits + short_unit: qb + display: + numDecimalPlaces: 0 + description: Highest number of quantum bits in a single circuit-based quantum processor over time diff --git a/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.py b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.py new file mode 100644 index 00000000000..c25364bd965 --- /dev/null +++ b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.py @@ -0,0 +1,22 @@ +from etl.helpers import PathFinder, create_dataset, get_metadata_path +from etl.snapshot import Snapshot + +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # load snapshot + snap = Snapshot("fasttrack/2024-04-17/qubits.csv") + + # load data + tb = snap.read_csv() + + # add table, update metadata from *.meta.yml and save + ds = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata) + + # override metadata if necessary + meta_path = get_metadata_path(dest_dir).with_suffix(".override.yml") + if meta_path.exists(): + ds.update_metadata(meta_path) + + ds.save() diff --git a/snapshots/fasttrack/2024-04-17/qubits.csv.dvc b/snapshots/fasttrack/2024-04-17/qubits.csv.dvc new file mode 100644 index 00000000000..93267470810 --- /dev/null +++ b/snapshots/fasttrack/2024-04-17/qubits.csv.dvc @@ -0,0 +1,24 @@ +meta: + origin: + producer: Wikipedia + title: List of quantum processors + title_snapshot: Circuit-based quantum processors + citation_full: Wikipedia, List of quantum processors, Circuit-based quantum processors + version_producer: Google Sheet + url_main: https://en.wikipedia.org/wiki/List_of_quantum_processors + url_download: |- + https://docs.google.com/spreadsheets/d/e/2PACX-1vSVB8MqM1U7xLUV68Fd8TUeMiv2jWWGeT8EteyP-0Nvi4getanr9gxxlM0V1JhIlRkhfruB7vjfBTIy/pub?output=csv + date_accessed: '2024-04-17' + date_published: '2024' + license: + name: CC BY-SA + url: https://en.wikipedia.org/wiki/Wikipedia:Copyrights + name: Quantum processors over time + description: '' + license: + name: Creative Commons Attribution-ShareAlike 3.0 + url: https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License +outs: + - md5: 33a1d150a0261088258fa8b4b4a51a34 + size: 135 + path: qubits.csv From 823bb01741a48ff92ee2f71d1e1ad0625fefba0d Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 17 Apr 2024 12:10:27 +0000 Subject: [PATCH 37/40] fasttrack: fasttrack/2024-04-17/qubits.csv --- .../fasttrack/2024-04-17/qubits.meta.yml | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml index dfd2fcf0e76..40283d1b22d 100644 --- a/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml @@ -4,6 +4,7 @@ dataset: licenses: - name: Creative Commons Attribution-ShareAlike 
3.0 url: https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License + update_period_days: '365' tables: qubits: variables: @@ -13,4 +14,32 @@ tables: short_unit: qb display: numDecimalPlaces: 0 + description_key: + - '[' + - '"' + - t + - e + - s + - t + - '' + - '1' + - '"' + - ',' + - '' + - '"' + - t + - e + - s + - t + - '' + - '2' + - '"' + - ']' + description_from_producer: |- + This list contains quantum processors, also known as quantum processing units (QPUs). Some devices listed below have only been announced at press conferences so far, with no actual demonstrations or scientific publications characterizing the performance. + + Quantum processors are difficult to compare due to the different architectures and approaches. Due to this, published qubit numbers do not reflect the performance levels of the processor. This is instead achieved through benchmarking metrics such as quantum volume, randomized benchmarking or circuit layer operations per second (CLOPS). + + These QPUs are based on the quantum circuit and quantum logic gate-based model of computing. + processing_level: minor description: Highest number of quantum bits in a single circuit-based quantum processor over time From 22648a3523c7c8ede67671685bd7f9c1e27e0c43 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 17 Apr 2024 14:13:14 +0200 Subject: [PATCH 38/40] Update climate data and fix co2 concentration issue (#2531) * Archive unused climate steps * Add steps for all updated climate datasets, including EPA datasets * Archive unused climate steps --- dag/archive/climate.yml | 229 +++++++++++++++ dag/climate.yml | 275 ++++++------------ .../climate_change_impacts.meta.yml | 24 ++ .../2024-04-17/climate_change_impacts.py | 174 +++++++++++ .../2024-04-17/ghg_concentration.meta.yml | 44 +++ .../climate/2024-04-17/ghg_concentration.py | 143 +++++++++ .../long_run_ghg_concentration.meta.yml | 27 ++ .../2024-04-17/long_run_ghg_concentration.py | 84 ++++++ .../2024-04-17/ocean_heat_content.meta.yml | 29 ++ .../climate/2024-04-17/ocean_heat_content.py | 45 +++ .../2024-04-17/ocean_ph_levels.meta.yml | 22 ++ .../climate/2024-04-17/ocean_ph_levels.py | 86 ++++++ .../climate/2024-04-17/sea_ice_index.meta.yml | 19 ++ .../climate/2024-04-17/sea_ice_index.py | 44 +++ .../sea_surface_temperature.meta.yml | 29 ++ .../2024-04-17/sea_surface_temperature.py | 48 +++ .../2024-04-17/snow_cover_extent.meta.yml | 23 ++ .../climate/2024-04-17/snow_cover_extent.py | 97 ++++++ .../surface_temperature_analysis.meta.yml | 20 ++ .../surface_temperature_analysis.py | 56 ++++ .../epa/2024-04-17/ghg_concentration.meta.yml | 30 ++ .../epa/2024-04-17/ghg_concentration.py | 75 +++++ .../ice_sheet_mass_balance.meta.yml | 31 ++ .../epa/2024-04-17/ice_sheet_mass_balance.py | 91 ++++++ .../mass_balance_us_glaciers.meta.yml | 17 ++ .../2024-04-17/mass_balance_us_glaciers.py | 39 +++ .../2024-04-17/ocean_heat_content.meta.yml | 34 +++ .../epa/2024-04-17/ocean_heat_content.py | 35 +++ .../climate_change_impacts_annual.py | 34 +++ .../climate_change_impacts_monthly.py | 37 +++ .../climate/2024-04-17/ghg_concentration.py | 42 +++ .../2024-04-17/hawaii_ocean_time_series.py | 29 ++ .../climate/2024-04-17/ocean_heat_content.py | 75 +++++ .../climate/2024-04-17/sea_ice_index.py | 51 ++++ .../2024-04-17/sea_surface_temperature.py | 49 ++++ .../climate/2024-04-17/snow_cover_extent.py | 50 ++++ .../surface_temperature_analysis.py | 62 ++++ .../epa/2024-04-17/ghg_concentration.py | 69 +++++ 
.../epa/2024-04-17/ice_sheet_mass_balance.py | 31 ++ .../2024-04-17/mass_balance_us_glaciers.py | 27 ++ .../epa/2024-04-17/ocean_heat_content.py | 63 ++++ .../ch4_concentration_monthly.csv.dvc | 23 ++ .../2024-04-17/climate_change_impacts.py | 202 +++++++++++++ .../co2_concentration_monthly.csv.dvc | 23 ++ .../hawaii_ocean_time_series.csv.dvc | 25 ++ .../n2o_concentration_monthly.csv.dvc | 23 ++ ...an_heat_content_annual_world_2000m.csv.dvc | 27 ++ ...ean_heat_content_annual_world_700m.csv.dvc | 27 ++ ...n_heat_content_monthly_world_2000m.csv.dvc | 28 ++ ...an_heat_content_monthly_world_700m.csv.dvc | 28 ++ .../climate/2024-04-17/sea_ice_index.xlsx.dvc | 19 ++ ...ce_temperature_northern_hemisphere.csv.dvc | 26 ++ ...ce_temperature_southern_hemisphere.csv.dvc | 26 ++ .../sea_surface_temperature_world.csv.dvc | 26 ++ .../snow_cover_extent_north_america.csv.dvc | 22 ++ ...w_cover_extent_northern_hemisphere.csv.dvc | 22 ++ ...ature_analysis_northern_hemisphere.csv.dvc | 19 ++ ...ature_analysis_southern_hemisphere.csv.dvc | 19 ++ ...surface_temperature_analysis_world.csv.dvc | 19 ++ .../epa/2024-04-17/ch4_concentration.csv.dvc | 29 ++ .../2024-04-17/climate_change_indicators.py | 43 +++ .../epa/2024-04-17/co2_concentration.csv.dvc | 29 ++ .../2024-04-17/ice_sheet_mass_balance.csv.dvc | 37 +++ .../mass_balance_us_glaciers.csv.dvc | 28 ++ .../epa/2024-04-17/n2o_concentration.csv.dvc | 29 ++ ...an_heat_content_annual_world_2000m.csv.dvc | 32 ++ ...ean_heat_content_annual_world_700m.csv.dvc | 32 ++ 67 files changed, 3167 insertions(+), 185 deletions(-) create mode 100644 etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py create mode 100644 etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml create mode 100644 etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py create mode 100644 etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml create mode 100644 etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py create mode 100644 etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml create mode 100644 etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py create mode 100644 etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml 
create mode 100644 etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py create mode 100644 etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml create mode 100644 etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py create mode 100644 etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_annual.py create mode 100644 etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_monthly.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/ghg_concentration.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/hawaii_ocean_time_series.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/ocean_heat_content.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/sea_ice_index.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/sea_surface_temperature.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/snow_cover_extent.py create mode 100644 etl/steps/data/meadow/climate/2024-04-17/surface_temperature_analysis.py create mode 100644 etl/steps/data/meadow/epa/2024-04-17/ghg_concentration.py create mode 100644 etl/steps/data/meadow/epa/2024-04-17/ice_sheet_mass_balance.py create mode 100644 etl/steps/data/meadow/epa/2024-04-17/mass_balance_us_glaciers.py create mode 100644 etl/steps/data/meadow/epa/2024-04-17/ocean_heat_content.py create mode 100644 snapshots/climate/2024-04-17/ch4_concentration_monthly.csv.dvc create mode 100644 snapshots/climate/2024-04-17/climate_change_impacts.py create mode 100644 snapshots/climate/2024-04-17/co2_concentration_monthly.csv.dvc create mode 100644 snapshots/climate/2024-04-17/hawaii_ocean_time_series.csv.dvc create mode 100644 snapshots/climate/2024-04-17/n2o_concentration_monthly.csv.dvc create mode 100644 snapshots/climate/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc create mode 100644 snapshots/climate/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc create mode 100644 snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_2000m.csv.dvc create mode 100644 snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_700m.csv.dvc create mode 100644 snapshots/climate/2024-04-17/sea_ice_index.xlsx.dvc create mode 100644 snapshots/climate/2024-04-17/sea_surface_temperature_northern_hemisphere.csv.dvc create mode 100644 snapshots/climate/2024-04-17/sea_surface_temperature_southern_hemisphere.csv.dvc create mode 100644 snapshots/climate/2024-04-17/sea_surface_temperature_world.csv.dvc create mode 100644 snapshots/climate/2024-04-17/snow_cover_extent_north_america.csv.dvc create mode 100644 snapshots/climate/2024-04-17/snow_cover_extent_northern_hemisphere.csv.dvc create mode 100644 snapshots/climate/2024-04-17/surface_temperature_analysis_northern_hemisphere.csv.dvc create mode 100644 snapshots/climate/2024-04-17/surface_temperature_analysis_southern_hemisphere.csv.dvc create mode 100644 snapshots/climate/2024-04-17/surface_temperature_analysis_world.csv.dvc create mode 100644 snapshots/epa/2024-04-17/ch4_concentration.csv.dvc create mode 100644 snapshots/epa/2024-04-17/climate_change_indicators.py create mode 100644 snapshots/epa/2024-04-17/co2_concentration.csv.dvc create mode 100644 snapshots/epa/2024-04-17/ice_sheet_mass_balance.csv.dvc create mode 100644 snapshots/epa/2024-04-17/mass_balance_us_glaciers.csv.dvc create mode 100644 snapshots/epa/2024-04-17/n2o_concentration.csv.dvc create mode 100644 snapshots/epa/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc create mode 100644 
snapshots/epa/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc diff --git a/dag/archive/climate.yml b/dag/archive/climate.yml index 64995e71adc..ff8675494a9 100644 --- a/dag/archive/climate.yml +++ b/dag/archive/climate.yml @@ -18,3 +18,232 @@ steps: - snapshot://imbie/2024-01-02/ice_sheet_mass_balance_greenland.csv data://garden/imbie/2024-01-02/ice_sheet_mass_balance: - data://meadow/imbie/2024-01-02/ice_sheet_mass_balance + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://meadow/climate/2024-01-31/sea_surface_temperature: + - snapshot://climate/2024-01-31/sea_surface_temperature_world.csv + - snapshot://climate/2024-01-31/sea_surface_temperature_northern_hemisphere.csv + - snapshot://climate/2024-01-31/sea_surface_temperature_southern_hemisphere.csv + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://garden/climate/2024-01-31/sea_surface_temperature: + - data://meadow/climate/2024-01-31/sea_surface_temperature + # + # GISS - Surface temperature analysis. + # + data://meadow/climate/2024-01-31/surface_temperature_analysis: + - snapshot://climate/2024-01-31/surface_temperature_analysis_world.csv + - snapshot://climate/2024-01-31/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-01-31/surface_temperature_analysis_southern_hemisphere.csv + # + # GISS - Surface temperature analysis. + # + data://garden/climate/2024-01-31/surface_temperature_analysis: + - data://meadow/climate/2024-01-31/surface_temperature_analysis + # + # NSIDC - Arctic sea ice extent. + # + data://meadow/climate/2024-01-31/sea_ice_index: + - snapshot://climate/2024-01-31/sea_ice_index.xlsx + # + # NSIDC - Arctic sea ice extent. + # + data://garden/climate/2024-01-31/sea_ice_index: + - data://meadow/climate/2024-01-31/sea_ice_index + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://meadow/climate/2024-01-31/ocean_heat_content: + - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_2000m.csv + - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_2000m.csv + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://garden/climate/2024-01-31/ocean_heat_content: + - data://meadow/climate/2024-01-31/ocean_heat_content + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://meadow/climate/2024-01-31/hawaii_ocean_time_series: + - snapshot://climate/2024-01-31/hawaii_ocean_time_series.csv + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://garden/climate/2024-01-31/ocean_ph_levels: + - data://meadow/climate/2024-01-31/hawaii_ocean_time_series + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://meadow/climate/2024-01-31/snow_cover_extent: + - snapshot://climate/2024-01-31/snow_cover_extent_north_america.csv + - snapshot://climate/2024-01-31/snow_cover_extent_northern_hemisphere.csv + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://garden/climate/2024-01-31/snow_cover_extent: + - data://meadow/climate/2024-01-31/snow_cover_extent + # + # NOAA Global Monitoring Laboratory - GHG concentration. 
+ # + data://meadow/climate/2024-01-31/ghg_concentration: + - snapshot://climate/2024-01-31/co2_concentration_monthly.csv + - snapshot://climate/2024-01-31/ch4_concentration_monthly.csv + - snapshot://climate/2024-01-31/n2o_concentration_monthly.csv + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://garden/climate/2024-01-31/ghg_concentration: + - data://meadow/climate/2024-01-31/ghg_concentration + # + # Various sources - Long-run greenhouse gas concentration. + # + data://garden/climate/2024-01-31/long_run_ghg_concentration: + - data://garden/epa/2024-01-29/ghg_concentration + - data://garden/climate/2024-01-31/ghg_concentration + # + # Various sources - Climate change impacts. + # + data://garden/climate/2024-01-31/climate_change_impacts: + - data://garden/climate/2024-01-31/surface_temperature_analysis + - data://garden/climate/2024-01-31/sea_ice_index + - data://garden/climate/2024-01-31/sea_surface_temperature + - data://garden/climate/2024-01-31/ocean_heat_content + - data://garden/climate/2024-01-31/ocean_ph_levels + - data://garden/climate/2024-01-31/snow_cover_extent + - data://garden/climate/2024-01-31/ghg_concentration + - data://garden/climate/2024-01-31/long_run_ghg_concentration + - data://garden/climate/2024-01-28/global_sea_level + - data://garden/epa/2024-01-29/ocean_heat_content + - data://garden/epa/2024-01-29/ice_sheet_mass_balance + - data://garden/epa/2024-01-29/mass_balance_us_glaciers + # + # Various sources - Climate change impacts (annual). + # + data://grapher/climate/2024-01-31/climate_change_impacts_annual: + - data://garden/climate/2024-01-31/climate_change_impacts + # + # Various sources - Climate change impacts (monthly). + # + data://grapher/climate/2024-01-31/climate_change_impacts_monthly: + - data://garden/climate/2024-01-31/climate_change_impacts + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://meadow/climate/2024-03-11/hawaii_ocean_time_series: + - snapshot://climate/2024-03-11/hawaii_ocean_time_series.csv + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://meadow/climate/2024-03-11/ocean_heat_content: + - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_2000m.csv + - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_2000m.csv + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://meadow/climate/2024-03-11/snow_cover_extent: + - snapshot://climate/2024-03-11/snow_cover_extent_northern_hemisphere.csv + - snapshot://climate/2024-03-11/snow_cover_extent_north_america.csv + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://meadow/climate/2024-03-11/sea_surface_temperature: + - snapshot://climate/2024-03-11/sea_surface_temperature_northern_hemisphere.csv + - snapshot://climate/2024-03-11/sea_surface_temperature_southern_hemisphere.csv + - snapshot://climate/2024-03-11/sea_surface_temperature_world.csv + # + # NSIDC - Arctic sea ice extent. + # + data://meadow/climate/2024-03-11/sea_ice_index: + - snapshot://climate/2024-03-11/sea_ice_index.xlsx + # + # GISS - Surface temperature analysis. 
+ # + data://meadow/climate/2024-03-11/surface_temperature_analysis: + - snapshot://climate/2024-03-11/surface_temperature_analysis_southern_hemisphere.csv + - snapshot://climate/2024-03-11/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-03-11/surface_temperature_analysis_world.csv + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://meadow/climate/2024-03-11/ghg_concentration: + - snapshot://climate/2024-03-11/n2o_concentration_monthly.csv + - snapshot://climate/2024-03-11/co2_concentration_monthly.csv + - snapshot://climate/2024-03-11/ch4_concentration_monthly.csv + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://garden/climate/2024-03-11/ocean_ph_levels: + - data://meadow/climate/2024-03-11/hawaii_ocean_time_series + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://garden/climate/2024-03-11/ocean_heat_content: + - data://meadow/climate/2024-03-11/ocean_heat_content + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://garden/climate/2024-03-11/snow_cover_extent: + - data://meadow/climate/2024-03-11/snow_cover_extent + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://garden/climate/2024-03-11/sea_surface_temperature: + - data://meadow/climate/2024-03-11/sea_surface_temperature + # + # NSIDC - Arctic sea ice extent. + # + data://garden/climate/2024-03-11/sea_ice_index: + - data://meadow/climate/2024-03-11/sea_ice_index + # + # GISS - Surface temperature analysis. + # + data://garden/climate/2024-03-11/surface_temperature_analysis: + - data://meadow/climate/2024-03-11/surface_temperature_analysis + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://garden/climate/2024-03-11/ghg_concentration: + - data://meadow/climate/2024-03-11/ghg_concentration + # + # Various sources - Long-run greenhouse gas concentration. + # + data://garden/climate/2024-03-11/long_run_ghg_concentration: + - data://garden/epa/2024-01-29/ghg_concentration + - data://garden/climate/2024-03-11/ghg_concentration + # + # Various sources - Climate change impacts. + # + data://garden/climate/2024-03-11/climate_change_impacts: + - data://garden/climate/2024-03-11/long_run_ghg_concentration + - data://garden/climate/2024-01-28/global_sea_level + - data://garden/epa/2024-01-29/ice_sheet_mass_balance + - data://garden/epa/2024-01-29/mass_balance_us_glaciers + - data://garden/epa/2024-01-29/ocean_heat_content + - data://garden/climate/2024-03-11/ocean_heat_content + - data://garden/climate/2024-03-11/surface_temperature_analysis + - data://garden/climate/2024-03-11/sea_ice_index + - data://garden/climate/2024-03-11/ghg_concentration + - data://garden/climate/2024-03-11/ocean_ph_levels + - data://garden/climate/2024-03-11/sea_surface_temperature + - data://garden/climate/2024-03-11/snow_cover_extent + # + # Various sources - Climate change impacts (monthly). + # + data://grapher/climate/2024-03-11/climate_change_impacts_monthly: + - data://garden/climate/2024-03-11/climate_change_impacts + # + # Various sources - Climate change impacts (annual). + # + data://grapher/climate/2024-03-11/climate_change_impacts_annual: + - data://garden/climate/2024-03-11/climate_change_impacts + # + # GISS - Surface temperature analysis. 
+ # + data://grapher/climate/latest/surface_temperature_analysis: + - data://garden/climate/2024-03-11/surface_temperature_analysis diff --git a/dag/climate.yml b/dag/climate.yml index fd50b88cbeb..8dd0e2f1a32 100644 --- a/dag/climate.yml +++ b/dag/climate.yml @@ -54,127 +54,10 @@ steps: data://grapher/climate/2023-12-20/surface_temperature_annual_average: - data://garden/climate/2023-12-20/surface_temperature # - # Met Office Hadley Centre - Sea surface temperature. - # - data://meadow/climate/2024-01-31/sea_surface_temperature: - - snapshot://climate/2024-01-31/sea_surface_temperature_world.csv - - snapshot://climate/2024-01-31/sea_surface_temperature_northern_hemisphere.csv - - snapshot://climate/2024-01-31/sea_surface_temperature_southern_hemisphere.csv - # - # Met Office Hadley Centre - Sea surface temperature. - # - data://garden/climate/2024-01-31/sea_surface_temperature: - - data://meadow/climate/2024-01-31/sea_surface_temperature - # - # GISS - Surface temperature analysis. - # - data://meadow/climate/2024-01-31/surface_temperature_analysis: - - snapshot://climate/2024-01-31/surface_temperature_analysis_world.csv - - snapshot://climate/2024-01-31/surface_temperature_analysis_northern_hemisphere.csv - - snapshot://climate/2024-01-31/surface_temperature_analysis_southern_hemisphere.csv - # - # GISS - Surface temperature analysis. - # - data://garden/climate/2024-01-31/surface_temperature_analysis: - - data://meadow/climate/2024-01-31/surface_temperature_analysis - # - # GISS - Surface temperature analysis. - # - data://grapher/climate/latest/surface_temperature_analysis: - - data://garden/climate/2024-01-31/surface_temperature_analysis - # - # NSIDC - Arctic sea ice extent. - # - data://meadow/climate/2024-01-31/sea_ice_index: - - snapshot://climate/2024-01-31/sea_ice_index.xlsx - # - # NSIDC - Arctic sea ice extent. - # - data://garden/climate/2024-01-31/sea_ice_index: - - data://meadow/climate/2024-01-31/sea_ice_index - # - # NOAA National Centers for Environmental Information - Ocean Heat Content. - # - data://meadow/climate/2024-01-31/ocean_heat_content: - - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_700m.csv - - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_2000m.csv - - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_700m.csv - - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_2000m.csv - # - # NOAA National Centers for Environmental Information - Ocean Heat Content. - # - data://garden/climate/2024-01-31/ocean_heat_content: - - data://meadow/climate/2024-01-31/ocean_heat_content - # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). - # - data://meadow/climate/2024-01-31/hawaii_ocean_time_series: - - snapshot://climate/2024-01-31/hawaii_ocean_time_series.csv - # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). - # - data://garden/climate/2024-01-31/ocean_ph_levels: - - data://meadow/climate/2024-01-31/hawaii_ocean_time_series - # - # Rutgers University Global Snow Lab - Snow Cover Extent. - # - data://meadow/climate/2024-01-31/snow_cover_extent: - - snapshot://climate/2024-01-31/snow_cover_extent_north_america.csv - - snapshot://climate/2024-01-31/snow_cover_extent_northern_hemisphere.csv - # - # Rutgers University Global Snow Lab - Snow Cover Extent. - # - data://garden/climate/2024-01-31/snow_cover_extent: - - data://meadow/climate/2024-01-31/snow_cover_extent - # - # NOAA Global Monitoring Laboratory - GHG concentration. 
- # - data://meadow/climate/2024-01-31/ghg_concentration: - - snapshot://climate/2024-01-31/co2_concentration_monthly.csv - - snapshot://climate/2024-01-31/ch4_concentration_monthly.csv - - snapshot://climate/2024-01-31/n2o_concentration_monthly.csv - # - # NOAA Global Monitoring Laboratory - GHG concentration. - # - data://garden/climate/2024-01-31/ghg_concentration: - - data://meadow/climate/2024-01-31/ghg_concentration - # - # Various sources - Long-run greenhouse gas concentration. - # - data://garden/climate/2024-01-31/long_run_ghg_concentration: - - data://garden/epa/2024-01-29/ghg_concentration - - data://garden/climate/2024-01-31/ghg_concentration - # - # Various sources - Climate change impacts. - # - data://garden/climate/2024-01-31/climate_change_impacts: - - data://garden/climate/2024-01-31/surface_temperature_analysis - - data://garden/climate/2024-01-31/sea_ice_index - - data://garden/climate/2024-01-31/sea_surface_temperature - - data://garden/climate/2024-01-31/ocean_heat_content - - data://garden/climate/2024-01-31/ocean_ph_levels - - data://garden/climate/2024-01-31/snow_cover_extent - - data://garden/climate/2024-01-31/ghg_concentration - - data://garden/climate/2024-01-31/long_run_ghg_concentration - - data://garden/climate/2024-01-28/global_sea_level - - data://garden/epa/2024-01-29/ocean_heat_content - - data://garden/epa/2024-01-29/ice_sheet_mass_balance - - data://garden/epa/2024-01-29/mass_balance_us_glaciers - # - # Various sources - Climate change impacts (annual). - # - data://grapher/climate/2024-01-31/climate_change_impacts_annual: - - data://garden/climate/2024-01-31/climate_change_impacts - # - # Various sources - Climate change impacts (monthly). - # - data://grapher/climate/2024-01-31/climate_change_impacts_monthly: - - data://garden/climate/2024-01-31/climate_change_impacts - # # Climate change impacts data explorer. # data://explorers/climate/latest/climate_change_impacts: - - data://garden/climate/2024-03-11/climate_change_impacts + - data://garden/climate/2024-04-17/climate_change_impacts # # Global Wildfire Information System - Monthly burned area. # @@ -235,114 +118,136 @@ steps: data://grapher/met_office_hadley_centre/2024-03-04/near_surface_temperature: - data://garden/met_office_hadley_centre/2024-03-04/near_surface_temperature # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # EPA - Climate change indicators (possibly not updateable). 
# - data://meadow/climate/2024-03-11/hawaii_ocean_time_series: - - snapshot://climate/2024-03-11/hawaii_ocean_time_series.csv + data://meadow/epa/2024-04-17/ocean_heat_content: + - snapshot://epa/2024-04-17/ocean_heat_content_annual_world_700m.csv + - snapshot://epa/2024-04-17/ocean_heat_content_annual_world_2000m.csv + data://garden/epa/2024-04-17/ocean_heat_content: + - data://meadow/epa/2024-04-17/ocean_heat_content + data://meadow/epa/2024-04-17/ice_sheet_mass_balance: + - snapshot://epa/2024-04-17/ice_sheet_mass_balance.csv + data://garden/epa/2024-04-17/ice_sheet_mass_balance: + - data://meadow/epa/2024-04-17/ice_sheet_mass_balance + data://meadow/epa/2024-04-17/ghg_concentration: + - snapshot://epa/2024-04-17/co2_concentration.csv + - snapshot://epa/2024-04-17/ch4_concentration.csv + - snapshot://epa/2024-04-17/n2o_concentration.csv + data://garden/epa/2024-04-17/ghg_concentration: + - data://meadow/epa/2024-04-17/ghg_concentration + data://meadow/epa/2024-04-17/mass_balance_us_glaciers: + - snapshot://epa/2024-04-17/mass_balance_us_glaciers.csv + data://garden/epa/2024-04-17/mass_balance_us_glaciers: + - data://meadow/epa/2024-04-17/mass_balance_us_glaciers # - # NOAA National Centers for Environmental Information - Ocean Heat Content. + # Rutgers University Global Snow Lab - Snow Cover Extent. # - data://meadow/climate/2024-03-11/ocean_heat_content: - - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_700m.csv - - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_2000m.csv - - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_700m.csv - - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_2000m.csv + data://meadow/climate/2024-04-17/snow_cover_extent: + - snapshot://climate/2024-04-17/snow_cover_extent_north_america.csv + - snapshot://climate/2024-04-17/snow_cover_extent_northern_hemisphere.csv # - # Rutgers University Global Snow Lab - Snow Cover Extent. + # NOAA National Centers for Environmental Information - Ocean Heat Content. # - data://meadow/climate/2024-03-11/snow_cover_extent: - - snapshot://climate/2024-03-11/snow_cover_extent_northern_hemisphere.csv - - snapshot://climate/2024-03-11/snow_cover_extent_north_america.csv + data://meadow/climate/2024-04-17/ocean_heat_content: + - snapshot://climate/2024-04-17/ocean_heat_content_annual_world_2000m.csv + - snapshot://climate/2024-04-17/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-04-17/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-04-17/ocean_heat_content_monthly_world_2000m.csv # - # Met Office Hadley Centre - Sea surface temperature. + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). # - data://meadow/climate/2024-03-11/sea_surface_temperature: - - snapshot://climate/2024-03-11/sea_surface_temperature_northern_hemisphere.csv - - snapshot://climate/2024-03-11/sea_surface_temperature_southern_hemisphere.csv - - snapshot://climate/2024-03-11/sea_surface_temperature_world.csv + data://meadow/climate/2024-04-17/hawaii_ocean_time_series: + - snapshot://climate/2024-04-17/hawaii_ocean_time_series.csv # # NSIDC - Arctic sea ice extent. # - data://meadow/climate/2024-03-11/sea_ice_index: - - snapshot://climate/2024-03-11/sea_ice_index.xlsx + data://meadow/climate/2024-04-17/sea_ice_index: + - snapshot://climate/2024-04-17/sea_ice_index.xlsx + # + # Met Office Hadley Centre - Sea surface temperature. 
+ # + data://meadow/climate/2024-04-17/sea_surface_temperature: + - snapshot://climate/2024-04-17/sea_surface_temperature_world.csv + - snapshot://climate/2024-04-17/sea_surface_temperature_southern_hemisphere.csv + - snapshot://climate/2024-04-17/sea_surface_temperature_northern_hemisphere.csv # # GISS - Surface temperature analysis. # - data://meadow/climate/2024-03-11/surface_temperature_analysis: - - snapshot://climate/2024-03-11/surface_temperature_analysis_southern_hemisphere.csv - - snapshot://climate/2024-03-11/surface_temperature_analysis_northern_hemisphere.csv - - snapshot://climate/2024-03-11/surface_temperature_analysis_world.csv + data://meadow/climate/2024-04-17/surface_temperature_analysis: + - snapshot://climate/2024-04-17/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-04-17/surface_temperature_analysis_world.csv + - snapshot://climate/2024-04-17/surface_temperature_analysis_southern_hemisphere.csv # # NOAA Global Monitoring Laboratory - GHG concentration. # - data://meadow/climate/2024-03-11/ghg_concentration: - - snapshot://climate/2024-03-11/n2o_concentration_monthly.csv - - snapshot://climate/2024-03-11/co2_concentration_monthly.csv - - snapshot://climate/2024-03-11/ch4_concentration_monthly.csv + data://meadow/climate/2024-04-17/ghg_concentration: + - snapshot://climate/2024-04-17/co2_concentration_monthly.csv + - snapshot://climate/2024-04-17/n2o_concentration_monthly.csv + - snapshot://climate/2024-04-17/ch4_concentration_monthly.csv # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # Rutgers University Global Snow Lab - Snow Cover Extent. # - data://garden/climate/2024-03-11/ocean_ph_levels: - - data://meadow/climate/2024-03-11/hawaii_ocean_time_series + data://garden/climate/2024-04-17/snow_cover_extent: + - data://meadow/climate/2024-04-17/snow_cover_extent # # NOAA National Centers for Environmental Information - Ocean Heat Content. # - data://garden/climate/2024-03-11/ocean_heat_content: - - data://meadow/climate/2024-03-11/ocean_heat_content + data://garden/climate/2024-04-17/ocean_heat_content: + - data://meadow/climate/2024-04-17/ocean_heat_content # - # Rutgers University Global Snow Lab - Snow Cover Extent. + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). # - data://garden/climate/2024-03-11/snow_cover_extent: - - data://meadow/climate/2024-03-11/snow_cover_extent + data://garden/climate/2024-04-17/ocean_ph_levels: + - data://meadow/climate/2024-04-17/hawaii_ocean_time_series # - # Met Office Hadley Centre - Sea surface temperature. + # NSIDC - Arctic sea ice extent. # - data://garden/climate/2024-03-11/sea_surface_temperature: - - data://meadow/climate/2024-03-11/sea_surface_temperature + data://garden/climate/2024-04-17/sea_ice_index: + - data://meadow/climate/2024-04-17/sea_ice_index # - # NSIDC - Arctic sea ice extent. + # Met Office Hadley Centre - Sea surface temperature. # - data://garden/climate/2024-03-11/sea_ice_index: - - data://meadow/climate/2024-03-11/sea_ice_index + data://garden/climate/2024-04-17/sea_surface_temperature: + - data://meadow/climate/2024-04-17/sea_surface_temperature # # GISS - Surface temperature analysis. 
# - data://garden/climate/2024-03-11/surface_temperature_analysis: - - data://meadow/climate/2024-03-11/surface_temperature_analysis + data://garden/climate/2024-04-17/surface_temperature_analysis: + - data://meadow/climate/2024-04-17/surface_temperature_analysis # # NOAA Global Monitoring Laboratory - GHG concentration. # - data://garden/climate/2024-03-11/ghg_concentration: - - data://meadow/climate/2024-03-11/ghg_concentration + data://garden/climate/2024-04-17/ghg_concentration: + - data://meadow/climate/2024-04-17/ghg_concentration # # Various sources - Long-run greenhouse gas concentration. # - data://garden/climate/2024-03-11/long_run_ghg_concentration: - - data://garden/epa/2024-01-29/ghg_concentration - - data://garden/climate/2024-03-11/ghg_concentration + data://garden/climate/2024-04-17/long_run_ghg_concentration: + - data://garden/epa/2024-04-17/ghg_concentration + - data://garden/climate/2024-04-17/ghg_concentration # # Various sources - Climate change impacts. # - data://garden/climate/2024-03-11/climate_change_impacts: - - data://garden/climate/2024-03-11/long_run_ghg_concentration + data://garden/climate/2024-04-17/climate_change_impacts: + - data://garden/epa/2024-04-17/ocean_heat_content + - data://garden/epa/2024-04-17/mass_balance_us_glaciers + - data://garden/climate/2024-04-17/sea_ice_index - data://garden/climate/2024-01-28/global_sea_level - - data://garden/epa/2024-01-29/ice_sheet_mass_balance - - data://garden/epa/2024-01-29/mass_balance_us_glaciers - - data://garden/epa/2024-01-29/ocean_heat_content - - data://garden/climate/2024-03-11/ocean_heat_content - - data://garden/climate/2024-03-11/surface_temperature_analysis - - data://garden/climate/2024-03-11/sea_ice_index - - data://garden/climate/2024-03-11/ghg_concentration - - data://garden/climate/2024-03-11/ocean_ph_levels - - data://garden/climate/2024-03-11/sea_surface_temperature - - data://garden/climate/2024-03-11/snow_cover_extent + - data://garden/epa/2024-04-17/ice_sheet_mass_balance + - data://garden/climate/2024-04-17/ghg_concentration + - data://garden/climate/2024-04-17/ocean_ph_levels + - data://garden/climate/2024-04-17/surface_temperature_analysis + - data://garden/climate/2024-04-17/snow_cover_extent + - data://garden/climate/2024-04-17/sea_surface_temperature + - data://garden/climate/2024-04-17/ocean_heat_content + - data://garden/climate/2024-04-17/long_run_ghg_concentration # - # Various sources - Climate change impacts (monthly). + # Various sources - Climate change impacts (annual). # - data://grapher/climate/2024-03-11/climate_change_impacts_monthly: - - data://garden/climate/2024-03-11/climate_change_impacts + data://grapher/climate/2024-04-17/climate_change_impacts_annual: + - data://garden/climate/2024-04-17/climate_change_impacts # - # Various sources - Climate change impacts (annual). + # Various sources - Climate change impacts (monthly). 
# - data://grapher/climate/2024-03-11/climate_change_impacts_annual: - - data://garden/climate/2024-03-11/climate_change_impacts + data://grapher/climate/2024-04-17/climate_change_impacts_monthly: + - data://garden/climate/2024-04-17/climate_change_impacts diff --git a/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml new file mode 100644 index 00000000000..2a5bbd540b2 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml @@ -0,0 +1,24 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: Climate Change Impacts + update_period_days: 60 + +tables: + climate_change_impacts_annual: + title: Climate Change Impacts - Annual + variables: + arctic_sea_ice_extent_min: + title: Minimum Arctic sea ice extent + arctic_sea_ice_extent_max: + title: Maximum Arctic sea ice extent + antarctic_sea_ice_extent_min: + title: Minimum Antarctic sea ice extent + antarctic_sea_ice_extent_max: + title: Maximum Antarctic sea ice extent + climate_change_impacts_monthly: + title: Climate Change Impacts - Monthly diff --git a/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py new file mode 100644 index 00000000000..38f00ffd808 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py @@ -0,0 +1,174 @@ +"""Create a garden dataset with all climate change impacts data. + +""" + +from owid.catalog import Table +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_sea_ice_extent(tb_nsidc: Table) -> Table: + tb_nsidc = tb_nsidc.copy() + # Create a table with the minimum and maximum Arctic sea ice extent. + # Assume minimum and maximum occur in September and February every year. + tb_nsidc["month"] = tb_nsidc["date"].astype(str).str[5:7] + tb_nsidc["year"] = tb_nsidc["date"].astype(str).str[0:4].astype(int) + arctic_sea_ice_extent = ( + tb_nsidc[(tb_nsidc["location"] == "Northern Hemisphere") & (tb_nsidc["month"].isin(["02", "09"]))] + .pivot(index=["location", "year"], columns=["month"], values="sea_ice_extent", join_column_levels_with=" ") + .rename(columns={"02": "arctic_sea_ice_extent_max", "09": "arctic_sea_ice_extent_min"}, errors="raise") + ) + # Instead of calling the location a generic "Northern Hemisphere", call it "Arctic Ocean". + arctic_sea_ice_extent["location"] = "Arctic Ocean" + + # Idem for the Antarctic sea ice extent. + # Assume maximum and minimum occur in September and February every year. + antarctic_sea_ice_extent = ( + tb_nsidc[(tb_nsidc["location"] == "Southern Hemisphere") & (tb_nsidc["month"].isin(["02", "09"]))] + .pivot(index=["location", "year"], columns=["month"], values="sea_ice_extent", join_column_levels_with=" ") + .rename(columns={"02": "antarctic_sea_ice_extent_min", "09": "antarctic_sea_ice_extent_max"}, errors="raise") + ) + # Instead of calling the location a generic "Southern Hemisphere", call it "Antarctica". 
+ antarctic_sea_ice_extent["location"] = "Antarctica" + + return arctic_sea_ice_extent, antarctic_sea_ice_extent + + +def prepare_ocean_heat_content(tb_ocean_heat_annual: Table, tb_ocean_heat_annual_epa: Table) -> Table: + # Combine NOAA's annual data on ocean heat content (which is more up-to-date) with the analogous EPA's data based on + # NOAA (which, for some reason, spans a longer time range for 2000m). Prioritize NOAA's data on common years. + tb_ocean_heat_annual = combine_two_overlapping_dataframes( + tb_ocean_heat_annual.rename( + columns={ + "ocean_heat_content_700m": "ocean_heat_content_noaa_700m", + "ocean_heat_content_2000m": "ocean_heat_content_noaa_2000m", + }, + errors="raise", + ), + tb_ocean_heat_annual_epa, + index_columns=["location", "year"], + ) + # Recover the original indicator titles (they are empty because of combining two columns with different titles). + tb_ocean_heat_annual["ocean_heat_content_noaa_700m"].metadata.title = tb_ocean_heat_annual_epa[ + "ocean_heat_content_noaa_700m" + ].metadata.title + tb_ocean_heat_annual["ocean_heat_content_noaa_2000m"].metadata.title = tb_ocean_heat_annual_epa[ + "ocean_heat_content_noaa_2000m" + ].metadata.title + + return tb_ocean_heat_annual + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load GISS dataset surface temperature analysis, and read monthly data. + ds_giss = paths.load_dataset("surface_temperature_analysis") + tb_giss = ds_giss["surface_temperature_analysis"].reset_index() + + # Load NSIDC dataset of sea ice index. + ds_nsidc = paths.load_dataset("sea_ice_index") + tb_nsidc = ds_nsidc["sea_ice_index"].reset_index() + + # Load Met Office dataset on sea surface temperature. + ds_met_office = paths.load_dataset("sea_surface_temperature") + tb_met_office = ds_met_office["sea_surface_temperature"].reset_index() + + # Load NOAA/NCIE dataset on ocean heat content. + ds_ocean_heat = paths.load_dataset("ocean_heat_content", namespace="climate") + tb_ocean_heat_monthly = ds_ocean_heat["ocean_heat_content_monthly"].reset_index() + tb_ocean_heat_annual = ds_ocean_heat["ocean_heat_content_annual"].reset_index() + + # Load EPA's compilation of data on ocean heat content. + ds_epa = paths.load_dataset("ocean_heat_content", namespace="epa") + tb_ocean_heat_annual_epa = ds_epa["ocean_heat_content"].reset_index() + + # Load ocean pH data from the School of Ocean and Earth Science and Technology. + ds_ocean_ph = paths.load_dataset("ocean_ph_levels") + tb_ocean_ph = ds_ocean_ph["ocean_ph_levels"].reset_index() + + # Load snow cover extent from Rutgers University Global Snow Lab. + ds_snow = paths.load_dataset("snow_cover_extent") + tb_snow = ds_snow["snow_cover_extent"].reset_index() + + # Load ice sheet mass balance data from EPA. + ds_ice_sheet = paths.load_dataset("ice_sheet_mass_balance") + tb_ice_sheet = ds_ice_sheet["ice_sheet_mass_balance"].reset_index() + + # Load annual data on mass balance of US glaciers from EPA. + ds_us_glaciers = paths.load_dataset("mass_balance_us_glaciers") + tb_us_glaciers = ds_us_glaciers["mass_balance_us_glaciers"].reset_index() + + # Load monthly greenhouse gas concentration data from NOAA/GML. + ds_gml = paths.load_dataset("ghg_concentration") + tb_gml = ds_gml["ghg_concentration"].reset_index() + + # Load long-run yearly greenhouse gas concentration data. + ds_ghg = paths.load_dataset("long_run_ghg_concentration") + tb_ghg = ds_ghg["long_run_ghg_concentration"].reset_index() + + # Load global sea level. 
+ ds_sea_level = paths.load_dataset("global_sea_level") + tb_sea_level = ds_sea_level["global_sea_level"].reset_index() + + # + # Process data. + # + # Prepare sea ice extent data. + arctic_sea_ice_extent, antarctic_sea_ice_extent = prepare_sea_ice_extent(tb_nsidc=tb_nsidc) + + # Prepare ocean heat content data. + tb_ocean_heat_annual = prepare_ocean_heat_content( + tb_ocean_heat_annual=tb_ocean_heat_annual, tb_ocean_heat_annual_epa=tb_ocean_heat_annual_epa + ) + + # Gather monthly data from different tables. + tb_monthly = tb_giss.astype({"date": str}).copy() + # NOTE: The values in tb_ocean_ph are monthly, but the dates are not consistently on the middle of the month. + # Instead, they are on different days of the month. When merging with other tables, this will create many nans. + # We could reindex linearly, but it's not a big deal. + for table in [ + tb_nsidc, + tb_met_office, + tb_ocean_heat_monthly, + tb_ocean_ph, + tb_snow, + tb_ice_sheet, + tb_gml, + tb_sea_level, + ]: + tb_monthly = tb_monthly.merge( + table.astype({"date": str}), + how="outer", + on=["location", "date"], + validate="one_to_one", + short_name="climate_change_impacts_monthly", + ) + + # Gather annual data from different tables. + tb_annual = tb_ocean_heat_annual.copy() + for table in [arctic_sea_ice_extent, antarctic_sea_ice_extent, tb_ghg, tb_us_glaciers.astype({"year": int})]: + tb_annual = tb_annual.merge( + table, + how="outer", + on=["location", "year"], + validate="one_to_one", + short_name="climate_change_impacts_annual", + ) + tb_annual.metadata.short_name = "climate_change_impacts_annual" + + # Set an appropriate index to monthly and annual tables, and sort conveniently. + tb_monthly = tb_monthly.set_index(["location", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create explorer dataset with combined table in csv format. + ds_explorer = create_dataset(dest_dir, tables=[tb_annual, tb_monthly]) + ds_explorer.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml new file mode 100644 index 00000000000..ca5e6073998 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml @@ -0,0 +1,44 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + description_short: Measured in parts per million. 
+ +dataset: + update_period_days: 60 + +tables: + ghg_concentration: + title: Monthly greenhouse gas concentration + variables: + co2_concentration: + title: Monthly concentration of atmospheric carbon dioxide + processing_level: minor + unit: parts per million + short_unit: ppm + ch4_concentration: + title: Monthly concentration of atmospheric methane + processing_level: minor + unit: parts per billion + short_unit: ppb + n2o_concentration: + title: Monthly concentration of atmospheric nitrous oxide + processing_level: minor + unit: parts per billion + short_unit: ppb + co2_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric carbon dioxide + processing_level: major + unit: parts per million + short_unit: ppm + ch4_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric methane + processing_level: major + unit: parts per billion + short_unit: ppb + n2o_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric nitrous oxide + processing_level: major + unit: parts per billion + short_unit: ppb diff --git a/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py new file mode 100644 index 00000000000..36d76ea290b --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py @@ -0,0 +1,143 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import List + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "year": "year", + "month": "month", + "average": "concentration", + # The following column is loaded only to perform a sanity check. + "decimal": "decimal", +} + + +def add_rolling_average(tb: Table, original_column_names: List[str]) -> Table: + tb_with_average = tb.copy() + + # Create a date range of each month (on the 15th). + # NOTE: The minimum date in the data is "2001-01-15", however, when passing this date to pd.date_range with + # freq="MS", the first point is dismissed because it is not the start of a month. For that reason, we shift the + # first point to be at the beginning of the month. + date_range = pd.date_range( + start=tb_with_average["date"].min() - pd.tseries.offsets.MonthBegin(1), + end=tb_with_average["date"].max(), + freq="MS", + ) + pd.DateOffset(days=14) + + # Get unique locations. + unique_locations = tb_with_average["location"].unique() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index(["location", "date"]).sort_index() + + # Create a MultiIndex with all possible combinations of date and location. + multi_index = pd.MultiIndex.from_product([unique_locations, date_range], names=["location", "date"]) + + # Reindex using the MultiIndex. + tb_with_average = tb_with_average.reindex(multi_index) + + for original_column_name in original_column_names: + # Create a rolling average with a window of one year, linearly interpolating missing values. + # NOTE: Currently no interpolation is needed, as no data points are missing (and in fact date_range is identical + # to the dates in the data). However, we need to interpolate in case there are missing points. Otherwise all + # points after the missing one will be nan. 
+ tb_with_average[f"{original_column_name}_yearly_average"] = ( + tb_with_average[original_column_name] + .interpolate("linear") + .rolling(12) + .mean() + .copy_metadata(tb_with_average[original_column_name]) + ) + + # Drop empty rows. + tb_with_average = tb_with_average.dropna(subset=original_column_names, how="all").reset_index() + + # Sort conveniently. + tb_with_average = tb_with_average.sort_values(["location", "date"]).reset_index(drop=True) + + for original_column_name in original_column_names: + # Check that the values of the original column have not been altered. + error = f"The values of the original {original_column_name} column have been altered." + assert tb_with_average[original_column_name].astype(float).equals(tb[original_column_name].astype(float)), error + + return tb_with_average + + +def prepare_gas_data(tb: Table) -> Table: + tb = tb.copy() + + # Extract gas name from table's short name. + gas = tb.metadata.short_name.split("_")[0] + + # Columns to select from the data, and how to rename them. + columns = { + "year": "year", + "month": "month", + "average": f"{gas}_concentration", + # The following column is loaded only to perform a sanity check. + "decimal": "decimal", + } + + # Select necessary columns and rename them. + tb = tb[list(columns)].rename(columns=columns, errors="raise") + + # There is a "decimal" column for the year as a decimal number, that only has 12 possible values, corresponding to + # the middle of each month, so we will assume the 15th of each month. + error = "Date format has changed." + assert len(set(tb["decimal"].astype(str).str.split(".").str[1])) == 12, error + assert set(tb["month"]) == set(range(1, 13)), error + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15)) + + # Remove unnecessary columns. + tb = tb.drop(columns=["year", "month", "decimal"], errors="raise") + + # Add a location column. + tb["location"] = "World" + + # Add a column with a rolling average for each gas. + tb = add_rolling_average(tb=tb, original_column_names=[f"{gas}_concentration"]) + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("ghg_concentration") + tb_co2 = ds_meadow["co2_concentration_monthly"].reset_index() + tb_ch4 = ds_meadow["ch4_concentration_monthly"].reset_index() + tb_n2o = ds_meadow["n2o_concentration_monthly"].reset_index() + + # + # Process data. + # + # Prepare data for each gas. + tb_co2 = prepare_gas_data(tb=tb_co2) + tb_ch4 = prepare_gas_data(tb=tb_ch4) + tb_n2o = prepare_gas_data(tb=tb_n2o) + + # Combine data for different gases. + tb = tb_co2.merge(tb_ch4, how="outer", on=["location", "date"]).merge( + tb_n2o, how="outer", on=["location", "date"], short_name=paths.short_name + ) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. 
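For reference, the rolling-average logic in the garden ghg_concentration step above boils down to: place each monthly value on the 15th of its month, interpolate any gaps, and take a 12-point rolling mean. A minimal plain-pandas sketch with synthetic numbers (not the real NOAA/GML values):

```python
import numpy as np
import pandas as pd

# Synthetic monthly series standing in for one gas (values are made up).
tb = pd.DataFrame({"year": [2020] * 12 + [2021] * 12, "month": list(range(1, 13)) * 2})
tb["co2_concentration"] = 410 + 0.2 * np.arange(len(tb)) + np.sin(np.arange(len(tb)) / 2)

# Mid-month dates, as in prepare_gas_data.
tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15))
tb = tb.set_index("date").sort_index()

# Interpolate gaps (a no-op here) and take a 12-month rolling mean; the first 11 rows are NaN
# because the window is not yet complete.
tb["co2_concentration_yearly_average"] = tb["co2_concentration"].interpolate("linear").rolling(12).mean()
print(tb.tail(3))
```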
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml new file mode 100644 index 00000000000..b02cba814ea --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml @@ -0,0 +1,27 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + - CO2 & Greenhouse Gas Emissions + description_processing: |- + - Long-run data from ice core studies has been merged with recent measurements of atmospheric concentration of greenhouse gases. + +dataset: + update_period_days: 0 + +tables: + long_run_ghg_concentration: + variables: + co2_concentration: + title: Long-run CO₂ concentration + unit: parts per million volume + short_unit: ppmv + ch4_concentration: + title: Long-run CH₄ concentration + unit: parts per billion volume + short_unit: ppbv + n2o_concentration: + title: Long-run N₂O concentration + unit: parts per billion volume + short_unit: ppbv diff --git a/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py new file mode 100644 index 00000000000..0e07095b425 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py @@ -0,0 +1,84 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def convert_monthly_to_annual(tb_new: Table) -> Table: + tb_new = tb_new.copy() + + # Create a year column. + tb_new["year"] = tb_new["date"].dt.year + + # Create a table with the number of observations per year. + tb_counts = tb_new.groupby("year", as_index=False).agg( + { + "co2_concentration": "count", + "ch4_concentration": "count", + "n2o_concentration": "count", + } + ) + # Create a table with the average annual values. + tb_new = tb_new.groupby("year", as_index=False).agg( + { + "co2_concentration": "mean", + "ch4_concentration": "mean", + "n2o_concentration": "mean", + } + ) + # Make nan all data points based on less than 12 observations per year. + for gas in ["co2", "ch4", "n2o"]: + tb_new.loc[tb_counts[f"{gas}_concentration"] < 12, f"{gas}_concentration"] = None + + # Drop empty rows. + tb_new = tb_new.dropna( + subset=["co2_concentration", "ch4_concentration", "n2o_concentration"], how="all" + ).reset_index(drop=True) + + return tb_new + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset on long-run GHG concentrations from EPA, and read its main table. + ds_old = paths.load_dataset("ghg_concentration", namespace="epa") + tb_old = ds_old["ghg_concentration"].reset_index() + + # Load garden dataset of up-to-date GHG concentrations, and read its main table. + ds_new = paths.load_dataset("ghg_concentration", namespace="climate") + tb_new = ds_new["ghg_concentration"].reset_index() + + # + # Process data. + # + # Select columns. + tb_new = tb_new[["date", "co2_concentration", "ch4_concentration", "n2o_concentration"]].copy() + + # Calculate average annual values. + tb_new = convert_monthly_to_annual(tb_new=tb_new) + + # Combine old and new data, prioritizing the latter. 
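One detail worth spelling out in convert_monthly_to_annual above is the completeness rule: an annual mean is only kept if all 12 monthly observations are present. A small plain-pandas sketch with made-up numbers:

```python
import pandas as pd

# Toy monthly values: 2020 is complete, 2021 only has three months (numbers are made up).
tb = pd.DataFrame({"year": [2020] * 12 + [2021] * 3, "co2_concentration": [410 + 0.1 * i for i in range(15)]})

# Count and average observations per year, then blank out years with fewer than 12 months.
counts = tb.groupby("year", as_index=False)["co2_concentration"].count()
annual = tb.groupby("year", as_index=False)["co2_concentration"].mean()
annual.loc[counts["co2_concentration"] < 12, "co2_concentration"] = None

print(annual)  # 2020 keeps its mean; 2021 becomes NaN
```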
+ tb = combine_two_overlapping_dataframes(df1=tb_new, df2=tb_old, index_columns=["year"]) + + # Rename table. + tb.metadata.short_name = paths.short_name + + # Add location column. + tb["location"] = "World" + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml new file mode 100644 index 00000000000..c7f6fb474ea --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml @@ -0,0 +1,29 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + description_short: Measured in 10²² Joules. + unit: 10²² Joules + short_unit: 10²² J + +dataset: + title: Ocean Heat Content + update_period_days: 60 + +tables: + ocean_heat_content_monthly: + title: Ocean Heat Content - Monthly average + variables: + ocean_heat_content_700m: + title: Monthly average ocean heat content for the 0-700 meters layer + ocean_heat_content_2000m: + title: Monthly average ocean heat content for the 0-2000 meters layer + ocean_heat_content_annual: + title: Ocean Heat Content - Annual average + variables: + ocean_heat_content_700m: + title: Annual average ocean heat content for the 0-700 meters layer + ocean_heat_content_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py new file mode 100644 index 00000000000..dcbafe0d14c --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py @@ -0,0 +1,45 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("ocean_heat_content") + tb_monthly = ds_meadow["ocean_heat_content_monthly"].reset_index() + tb_annual = ds_meadow["ocean_heat_content_annual"].reset_index() + + # + # Process data. + # + # Improve the format of the date column in monthly date (assume the middle of the month for each data point). + tb_monthly["date"] = ( + tb_monthly["date"].str.split("-").str[0] + "-" + tb_monthly["date"].str.split("-").str[1].str.zfill(2) + "-15" + ) + + # Replace date column (where all years are given as, e.g. 1955.5, 2000.5) by year column in annual data. + tb_annual["year"] = tb_annual["date"].astype(int) + tb_annual = tb_annual.drop(columns=["date"], errors="raise") + + # Instead of having a column for depth, create columns of heat content for each depth. + tb_monthly["depth"] = tb_monthly["depth"].astype(str) + "m" + tb_monthly = tb_monthly.pivot(index=["location", "date"], columns="depth", join_column_levels_with="_") + tb_annual["depth"] = tb_annual["depth"].astype(str) + "m" + tb_annual = tb_annual.pivot(index=["location", "year"], columns="depth", join_column_levels_with="_") + + # Set an appropriate index to each table and sort conveniently. 
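The pivot in the garden ocean_heat_content step turns the long (location, year, depth) layout into one column per depth; join_column_levels_with is a convenience of owid's Table.pivot. A rough plain-pandas equivalent with made-up numbers:

```python
import pandas as pd

# Toy long-format ocean heat content (values are made up).
tb = pd.DataFrame(
    {
        "location": ["World"] * 4,
        "year": [2020, 2020, 2021, 2021],
        "depth": ["700m", "2000m", "700m", "2000m"],
        "ocean_heat_content": [22.1, 33.5, 23.0, 34.8],
    }
)

# One column per depth, then flatten the column names with "_" to get e.g. ocean_heat_content_700m.
wide = tb.pivot(index=["location", "year"], columns="depth", values="ocean_heat_content")
wide.columns = [f"ocean_heat_content_{depth}" for depth in wide.columns]
wide = wide.reset_index()

print(wide)  # columns: location, year, ocean_heat_content_2000m, ocean_heat_content_700m
```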
+ tb_monthly = tb_monthly.set_index(["location", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb_annual, tb_monthly], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml new file mode 100644 index 00000000000..d9364bd3280 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml @@ -0,0 +1,22 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + unit: pH + short_unit: pH + +dataset: + title: Ocean pH Levels + update_period_days: 60 + +tables: + ocean_ph_levels: + title: Ocean pH levels + variables: + ocean_ph: + title: Monthly measurement of ocean pH levels + processing_level: minor + ocean_ph_yearly_average: + title: Rolling yearly average of ocean pH levels + processing_level: major diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py new file mode 100644 index 00000000000..db98a40272e --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py @@ -0,0 +1,86 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "date": "date", + "phcalc_insitu": "ocean_ph", +} + + +def add_rolling_average(tb: Table) -> Table: + tb_with_average = tb.copy() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index("date").sort_index() + + # Since values are given at different days of the month, reindex to have a value for each day. + tb_with_average = tb_with_average.reindex( + pd.date_range(start=tb_with_average.index.min(), end=tb_with_average.index.max(), freq="1D") + ) + + # Create a rolling average with a window of one year, linearly interpolating missing values. + tb_with_average["ocean_ph_yearly_average"] = ( + tb_with_average["ocean_ph"] + .interpolate(method="time") + .rolling(365) + .mean() + .copy_metadata(tb_with_average["ocean_ph"]) + ) + + # Drop empty rows. + tb_with_average = ( + tb_with_average.dropna(subset=["ocean_ph"]).reset_index().rename(columns={"index": "date"}, errors="raise") + ) + + # Check that the values of the original ocean ph column have not been altered. + error = "The values of the original ocean_ph column have been altered." + assert tb_with_average["ocean_ph"].equals( + tb.dropna(subset=["ocean_ph"]).sort_values("date").reset_index(drop=True)["ocean_ph"] + ), error + + return tb_with_average + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("hawaii_ocean_time_series") + tb_meadow = ds_meadow["hawaii_ocean_time_series"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb = tb_meadow[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Add location column. + tb["location"] = "Hawaii" + + # Improve format of date column. + tb["date"] = pd.to_datetime(tb["date"], format="%d-%b-%y") + + # Add a column with a rolling average. 
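The ocean pH rolling average above has to cope with samples taken on irregular days, which is why the series is first reindexed onto a daily grid and interpolated in time before the 365-day window is applied. A self-contained sketch with synthetic measurements (not the real Hawaii Ocean Time-series values):

```python
import numpy as np
import pandas as pd

# Synthetic pH samples on irregular days over roughly three years (values are made up).
rng = np.random.default_rng(0)
days = np.sort(rng.choice(365 * 3, size=36, replace=False))
dates = pd.to_datetime("2015-01-01") + pd.to_timedelta(days, unit="D")
ph = pd.Series(8.10 - 0.00005 * np.arange(36) + rng.normal(0, 0.005, 36), index=dates, name="ocean_ph")

# Daily grid, time-based interpolation, then a 365-day rolling mean (NaN until a full year is available).
daily = ph.reindex(pd.date_range(dates.min(), dates.max(), freq="1D"))
yearly_average = daily.interpolate(method="time").rolling(365).mean()

print(yearly_average.dropna().head())
```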
+ tb = add_rolling_average(tb=tb) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # Rename table. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml new file mode 100644 index 00000000000..7facebf9240 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml @@ -0,0 +1,19 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: Sea Ice Index + update_period_days: 60 + +tables: + sea_ice_index: + variables: + sea_ice_extent: + title: Sea ice extent + # description_short: TODO + unit: million square kilometers + short_unit: million km² + processing_level: minor diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py new file mode 100644 index 00000000000..3f8247e42b5 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py @@ -0,0 +1,44 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("sea_ice_index") + + # Read table from meadow dataset. + tb = ds_meadow["sea_ice_index"].reset_index() + + # + # Process data. + # + # Remove column with annual average. + tb = tb.drop(columns=["annual"]) + + # Convert table to long format. + tb = tb.melt(id_vars=["location", "year"], var_name="month", value_name="sea_ice_extent") + + # Create column of date, assuming each measurement is taken mid month. + tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"].str[0:3] + "15", format="%Y%b%d") + + # Drop empty rows and unnecessary columns. + tb = tb.dropna().drop(columns=["year", "month"]) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml new file mode 100644 index 00000000000..bf9ee9d13dc --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml @@ -0,0 +1,29 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + +dataset: + title: Sea surface temperature + update_period_days: 60 + +tables: + sea_surface_temperature: + variables: + sea_temperature_anomaly: + title: "Monthly sea surface temperature anomaly" + description_short: Measured in degrees Celsius. + unit: °C + short_unit: °C + sea_temperature_anomaly_low: + title: "Monthly sea surface temperature anomaly (lower bound)" + description_short: Measured in degrees Celsius. 
+ unit: °C + short_unit: °C + sea_temperature_anomaly_high: + title: "Monthly sea surface temperature anomaly (upper bound)" + description_short: Measured in degrees Celsius. + unit: °C + short_unit: °C diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py new file mode 100644 index 00000000000..2c2fb56098e --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py @@ -0,0 +1,48 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +# Columns to select from data, and how to rename them. +COLUMNS = { + "year": "year", + "month": "month", + "location": "location", + "anomaly": "sea_temperature_anomaly", + "lower_bound_95pct_bias_uncertainty_range": "sea_temperature_anomaly_low", + "upper_bound_95pct_bias_uncertainty_range": "sea_temperature_anomaly_high", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("sea_surface_temperature") + tb = ds_meadow["sea_surface_temperature"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Create a date column (assume the middle of the month for each monthly data point). + tb["date"] = tb["year"].astype(str) + "-" + tb["month"].astype(str).str.zfill(2) + "-15" + + # Remove unnecessary columns. + tb = tb.drop(columns=["year", "month"], errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml new file mode 100644 index 00000000000..698ad73c63f --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml @@ -0,0 +1,23 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + unit: "square kilometers" + short_unit: "km²" + description_short: Measured in square kilometers. + +dataset: + title: Snow Cover Extent + update_period_days: 60 + +tables: + snow_cover_extent: + title: Snow Cover Extent + variables: + snow_cover_extent: + title: Monthly measurement of the area covered by snow + processing_level: minor + snow_cover_extent_yearly_average: + title: Rolling yearly average of the area covered by snow + processing_level: major diff --git a/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py new file mode 100644 index 00000000000..f5b5d039b34 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py @@ -0,0 +1,97 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. 
+COLUMNS = { + "date": "date", + "phcalc_insitu": "ocean_ph", +} + + +def add_rolling_average(tb: Table, original_column_name: str) -> Table: + tb_with_average = tb.copy() + + # Create a date range. + date_range = pd.date_range(start=tb_with_average["date"].min(), end=tb_with_average["date"].max(), freq="1D") + + # Get unique locations. + unique_locations = tb_with_average["location"].unique() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index(["location", "date"]).sort_index() + + # Create a MultiIndex with all possible combinations of date and location. + multi_index = pd.MultiIndex.from_product([unique_locations, date_range], names=["location", "date"]) + + # Reindex using the MultiIndex. + tb_with_average = tb_with_average.reindex(multi_index) + + # Create a rolling average with a window of one year, linearly interpolating missing values. + tb_with_average[f"{original_column_name}_yearly_average"] = ( + tb_with_average[original_column_name] + .interpolate(method="linear") + .rolling(365) + .mean() + .copy_metadata(tb_with_average[original_column_name]) + ) + + # Drop empty rows. + tb_with_average = tb_with_average.dropna(subset=[original_column_name]).reset_index() + + # Remove rolling average for the first year, given that it is based on incomplete data. + tb_with_average.loc[ + tb_with_average["date"] < tb_with_average["date"].min() + pd.Timedelta(days=365), + f"{original_column_name}_yearly_average", + ] = None + + # Sort conveniently. + tb_with_average = tb_with_average.sort_values(["location", "date"]).reset_index(drop=True) + + # Check that the values of the original column have not been altered. + error = f"The values of the original {original_column_name} column have been altered." + assert tb_with_average[original_column_name].astype(int).equals(tb[original_column_name].astype(int)), error + + return tb_with_average + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("snow_cover_extent") + tb = ds_meadow["snow_cover_extent"].reset_index() + + # + # Process data. + # + # Create a date column. + # NOTE: Assign the middle of the month. + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15)) + tb = tb.drop(columns=["year", "month"], errors="raise") + + # Data starts in 1966, but, as mentioned on their website + # https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + # there is missing data between 1968 and 1971. + # So, for simplicity, select data from 1972 onwards, where data is complete. + tb = tb[tb["date"] >= "1972-01-01"].reset_index(drop=True) + + # Add a column with a rolling average. + tb = add_rolling_average(tb=tb, original_column_name="snow_cover_extent") + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml new file mode 100644 index 00000000000..eda07f5ae5a --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml @@ -0,0 +1,20 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: GISS surface temperature analysis + update_period_days: 60 + +tables: + surface_temperature_analysis: + variables: + temperature_anomaly: + title: "Global warming: monthly temperature anomaly" + description_short: |- + Combined land-surface air and sea-surface water temperature anomaly, given as the deviation from the 1951-1980 mean, in degrees Celsius. + unit: °C + short_unit: °C + processing_level: minor diff --git a/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py new file mode 100644 index 00000000000..43d328abbde --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py @@ -0,0 +1,56 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("surface_temperature_analysis") + tb = ds_meadow["surface_temperature_analysis_world"] + + # + # Process data. + # + # Initialize dictionary to store processed tables. + tables = {} + for table_name in ds_meadow.table_names: + # Read table. + tb = ds_meadow[table_name].reset_index() + # Get location from table name. + location = table_name.split("surface_temperature_analysis_")[-1].replace("_", " ").title() + # Add column for location. + tb["location"] = location + # Convert table to long format. + tb = tb.melt(id_vars=["year", "location"], var_name="month", value_name="temperature_anomaly") + # Create column of date, assuming each measurement is taken mid month. + tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"] + "15", format="%Y%b%d") + # Copy metadata from any other previous column. + tb["date"] = tb["date"].copy_metadata(tb["location"]) + # Select necessary columns. + tb = tb[["location", "date", "temperature_anomaly"]] + # Remove rows with missing values. + tb = tb.dropna(subset=["temperature_anomaly"]).reset_index(drop=True) + # Update table. + tables[location] = tb + + # Concatenate all tables. + tb = pr.concat(list(tables.values()), ignore_index=True, short_name=paths.short_name) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. 
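The reshaping in the surface_temperature_analysis garden step (melting the Jan–Dec columns and parsing dates such as "2021Jan15" with "%Y%b%d") can be seen in isolation in this small sketch with made-up anomalies:

```python
import pandas as pd

# Toy wide-format rows with one column per month (values are made up).
tb = pd.DataFrame({"year": [2021], "Jan": [0.81], "Feb": [0.64], "Mar": [0.88]})
tb["location"] = "World"

# Long format, then a mid-month date built from strings like "2021Jan15".
tb = tb.melt(id_vars=["year", "location"], var_name="month", value_name="temperature_anomaly")
tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"] + "15", format="%Y%b%d")

print(tb[["location", "date", "temperature_anomaly"]])
```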
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml new file mode 100644 index 00000000000..933924e021d --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml @@ -0,0 +1,30 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: major + description_key: + - Based on ice core studies of historical concentration of greenhouse gases, and recent air monitoring sites around the world. + description_from_producer: |- + This indicator describes how the levels of major greenhouse gases (GHGs) in the atmosphere have changed over geological time and in recent years. Changes in atmospheric GHGs, in part caused by human activities, affect the amount of energy held in the Earth-atmosphere system and thus affect the Earth's climate. This indicator is highly relevant to climate change because greenhouse gases from human activities are the primary driver of observed climate change since the mid-20th century (IPCC, 2021). + +dataset: + update_period_days: 0 + +tables: + ghg_concentration: + title: Global Atmospheric Concentrations of Greenhouse Gases + variables: + co2_concentration: + title: Global atmospheric concentration of carbon dioxide + unit: parts per million + short_unit: ppm + ch4_concentration: + title: Global atmospheric concentration of methane + unit: parts per billion + short_unit: ppb + n2o_concentration: + title: Global atmospheric concentration of nitrous oxide + unit: parts per billion + short_unit: ppb diff --git a/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py new file mode 100644 index 00000000000..e244a717be8 --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py @@ -0,0 +1,75 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to read from the data, and how to rename them. +COLUMNS_CO2 = { + "year": "year", + "antarctic_ice_cores": "co2_concentration", +} +COLUMNS_CH4 = { + "year": "year", + "epica_dome_c__antarctica": "ch4_concentration", +} +COLUMNS_N2O = { + "year": "year", + "epica_dome_c__antarctica": "n2o_concentration", +} + + +def approximate_data_for_each_year(tb: Table, column: str) -> Table: + tb = tb.copy() + + # Round each year to its closer integer. + tb["year"] = tb["year"].round(0).astype(int) + + # If there are multiple rows for a given year, take the average value. + tb = tb.groupby("year", as_index=False).agg({column: "mean"}) + + # Remove empty rows. + tb = tb.dropna(subset=[column]).reset_index(drop=True) + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("ghg_concentration") + tb_co2 = ds_meadow["co2_concentration"].reset_index() + tb_ch4 = ds_meadow["ch4_concentration"].reset_index() + tb_n2o = ds_meadow["n2o_concentration"].reset_index() + + # + # Process data. + # + # Select and rename columns. 
+ tb_co2 = tb_co2[list(COLUMNS_CO2)].rename(columns=COLUMNS_CO2, errors="raise") + tb_ch4 = tb_ch4[list(COLUMNS_CH4)].rename(columns=COLUMNS_CH4, errors="raise") + tb_n2o = tb_n2o[list(COLUMNS_N2O)].rename(columns=COLUMNS_N2O, errors="raise") + + # Since pandas datetime cannot handle such long past dates, for simplicity, round up years, and take average + # concentration of year for which there are multiple rows. + tb_co2 = approximate_data_for_each_year(tb_co2, "co2_concentration") + tb_ch4 = approximate_data_for_each_year(tb_ch4, "ch4_concentration") + tb_n2o = approximate_data_for_each_year(tb_n2o, "n2o_concentration") + + # Combine data for all gases. + tb = tb_co2.merge(tb_ch4, on="year", how="outer").merge(tb_n2o, on="year", how="outer", short_name=paths.short_name) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml new file mode 100644 index 00000000000..d7791eb36f8 --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml @@ -0,0 +1,31 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: major + unit: billion tonnes + short_unit: billion t + description_key: + - Values are centered at zero in 2002 to provide a consistent point of reference. + - A downward slope indicates a net loss of ice and snow. + - For reference, 1,000 billion metric tons is enough to raise sea level by about 3 millimeters. + +dataset: + title: Ice Sheet Mass Balance + update_period_days: 0 + +tables: + ice_sheet_mass_balance: + title: Ice Sheet Mass Balance + variables: + cumulative_ice_mass_change_imbie: + title: Cumulative change in mass in the ice sheets, according to IMBIE + description_short: Measured in billion tonnes. Based on more than 20 different studies that have been combined for each region. + presentation: + title_variant: IMBIE + land_ice_mass_nasa: + title: Cumulative change in mass in the ice sheets, according to NASA/JPL + description_short: Measured in billion tonnes. + presentation: + title_variant: NASA/JPL diff --git a/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py new file mode 100644 index 00000000000..8c03e21269c --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py @@ -0,0 +1,91 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def fix_multiple_rows_for_the_same_year(tb: Table) -> Table: + # There are repeated years, but there are no ambiguities (i.e. for each column, either the first or the second + # repeated year has data, not both). + # To fix that, remove nans from each column and merge them together. 
+    tb_corrected = tb[["year"]].drop_duplicates().reset_index(drop=True)
+    for column in tb.columns[1:]:
+        tb_column = tb[["year", column]].dropna().reset_index(drop=True)
+        assert tb_column[tb_column.duplicated(subset="year", keep=False)].empty
+        tb_corrected = tb_corrected.merge(tb_column, how="outer", on="year")
+
+    return tb_corrected
+
+
+def decimal_date_to_date(year: float) -> str:
+    # Convert a decimal year (e.g. 2002.5) into an ISO date string, scaling its fractional part by the average
+    # Gregorian year length of 365.2425 days.
+    return str((pd.to_datetime(str(int(year)), format="%Y") + pd.Timedelta(days=(year % 1) * 365.2425)).date())
+
+
+def separate_antarctica_and_greenland_data(tb: Table) -> Table:
+    columns_antarctica = {
+        "date": "date",
+        "nasa__antarctica_land_ice_mass": "land_ice_mass_nasa",
+        "imbie__antarctica_cumulative_ice_mass_change": "cumulative_ice_mass_change_imbie",
+    }
+    tb_antarctica = (
+        tb[list(columns_antarctica)]
+        .rename(columns=columns_antarctica, errors="raise")
+        .assign(**{"location": "Antarctica"})
+        .copy()
+    )
+    columns_greenland = {
+        "date": "date",
+        "nasa__greenland_land_ice_mass": "land_ice_mass_nasa",
+        "imbie__greenland_cumulative_ice_mass_change": "cumulative_ice_mass_change_imbie",
+    }
+    tb_greenland = (
+        tb[list(columns_greenland)]
+        .rename(columns=columns_greenland, errors="raise")
+        .assign(**{"location": "Greenland"})
+        .copy()
+    )
+
+    # Combine data for Antarctica and Greenland.
+    tb_combined = pr.concat([tb_antarctica, tb_greenland], ignore_index=True)
+
+    return tb_combined
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and read its main table.
+    ds_meadow = paths.load_dataset("ice_sheet_mass_balance")
+    tb = ds_meadow["ice_sheet_mass_balance"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Fix issue with the original data, where there are multiple rows for the same year.
+    tb = fix_multiple_rows_for_the_same_year(tb=tb)
+
+    # Remove empty rows.
+    tb = tb.dropna(how="all")
+
+    # Create a date column (given that "year" is given with decimals).
+    tb["date"] = tb["year"].apply(decimal_date_to_date).astype(str)
+
+    # Separate data for Antarctica and Greenland.
+    tb = separate_antarctica_and_greenland_data(tb=tb)
+
+    # Set an appropriate index to each table and sort conveniently.
+    tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index().sort_index(axis=1)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml
new file mode 100644
index 00000000000..db21db39b00
--- /dev/null
+++ b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml
@@ -0,0 +1,17 @@
+dataset:
+  update_period_days: 0
+
+tables:
+  mass_balance_us_glaciers:
+    title: Mass Balance of Glaciers in the United States
+    variables:
+      mass_balance_us_glaciers:
+        title: Cumulative mass balance
+        unit: meters
+        short_unit: m
+        description_short: |-
+          Measured in meters of water equivalent, which represent changes in the average thickness of a glacier relative to the base year 1965.
+ presentation: + topic_tags: + - Climate Change + processing_level: minor diff --git a/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py new file mode 100644 index 00000000000..87faf60c2cf --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py @@ -0,0 +1,39 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("mass_balance_us_glaciers") + tb = ds_meadow["mass_balance_us_glaciers"].reset_index() + + # + # Process data. + # + # Change column names to human-readable names. + tb = tb.rename( + columns={column: column.replace("_", " ").title() for column in tb.columns if column != "year"}, errors="raise" + ) + + # Transpose table to have location as a column. + tb = tb.melt(id_vars=["year"], var_name="location", value_name="mass_balance_us_glaciers") + + # Remove empty rows. + tb = tb.dropna().reset_index(drop=True) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml new file mode 100644 index 00000000000..0eec41a7f0e --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml @@ -0,0 +1,34 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + description_processing: |- + The amount of heat in the ocean, or ocean heat content, is an important indicator of climate change because the oceans ultimately absorb a large portion of the extra energy that greenhouse gases trap near the Earth's surface. Ocean heat content also plays an important role in the Earth's climate system because heat from ocean surface waters provides energy for storms and thereby influences weather patterns. + description_short: Measured in 10²² Joules. 
+ unit: 10²² Joules + short_unit: 10²² J + +dataset: + title: Ocean Heat Content + update_period_days: 0 + +tables: + ocean_heat_content: + title: Ocean Heat Content + variables: + ocean_heat_content_iap_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer, according to IAP + ocean_heat_content_noaa_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer, according to NOAA + ocean_heat_content_mri_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer, according to MRI/JMA + ocean_heat_content_mri_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to MRI/JMA + ocean_heat_content_noaa_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to NOAA + ocean_heat_content_iap_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to IAP + ocean_heat_content_csiro_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to CSIRO diff --git a/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py new file mode 100644 index 00000000000..df33d15ae7e --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py @@ -0,0 +1,35 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("ocean_heat_content") + tb = ds_meadow["ocean_heat_content"].reset_index() + + # + # Process data. + # + # Instead of having a column for depth, create columns of heat content for each depth. + tb["depth"] = tb["depth"].astype(str) + "m" + tb = tb.pivot(index=["location", "year"], columns="depth", join_column_levels_with="_") + + # Delete columns with no data. + tb = tb.dropna(how="all", axis=1).reset_index(drop=True) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_annual.py b/etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_annual.py new file mode 100644 index 00000000000..d2ce85e4a2d --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_annual.py @@ -0,0 +1,34 @@ +"""Load a garden dataset and create a grapher dataset. + +""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its annual table. + ds_garden = paths.load_dataset("climate_change_impacts") + tb_annual = ds_garden["climate_change_impacts_annual"].reset_index() + + # + # Process data. + # + # Create a country column (required by grapher). + tb_annual = tb_annual.rename(columns={"location": "country"}, errors="raise") + + # Set an appropriate index and sort conveniently. + tb_annual = tb_annual.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new grapher dataset. 
+ ds_grapher = create_dataset(dest_dir, tables=[tb_annual], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_monthly.py b/etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_monthly.py new file mode 100644 index 00000000000..c69428bae1b --- /dev/null +++ b/etl/steps/data/grapher/climate/2024-04-17/climate_change_impacts_monthly.py @@ -0,0 +1,37 @@ +"""Load a garden dataset and create a grapher dataset. + +""" + +from etl.grapher_helpers import adapt_table_with_dates_to_grapher +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its monthly table. + ds_garden = paths.load_dataset("climate_change_impacts") + tb = ds_garden["climate_change_impacts_monthly"].reset_index() + + # + # Process data. + # + # Create a country column (required by grapher). + tb = tb.rename(columns={"location": "country"}, errors="raise") + + # Adapt table with dates to grapher requirements. + tb = adapt_table_with_dates_to_grapher(tb) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new grapher dataset. + ds_grapher = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/ghg_concentration.py b/etl/steps/data/meadow/climate/2024-04-17/ghg_concentration.py new file mode 100644 index 00000000000..1ca24557052 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/ghg_concentration.py @@ -0,0 +1,42 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "co2_concentration_monthly", + "ch4_concentration_monthly", + "n2o_concentration_monthly", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Initialize dictionary to store raw tables. + tables = {} + for file_name in FILES: + # Retrieve snapshot. + snap = paths.load_snapshot(f"{file_name}.csv") + + # Load data from snapshot. + tables[file_name] = snap.read(comment="#", na_values="-9.99") + + # + # Process data. + # + for file_name, tb in tables.items(): + # Set an appropriate index and sort conveniently. + tables[file_name] = tb.set_index(["year", "month"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with one table for each gas. + ds_meadow = create_dataset(dest_dir, tables=tables.values(), check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/hawaii_ocean_time_series.py b/etl/steps/data/meadow/climate/2024-04-17/hawaii_ocean_time_series.py new file mode 100644 index 00000000000..0544b0cb638 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/hawaii_ocean_time_series.py @@ -0,0 +1,29 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load snapshot and read its data. 
+ tb = paths.load_snapshot("hawaii_ocean_time_series.csv").read(skiprows=8, sep="\t", na_values=[-999]) + + # + # Process data. + # + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["date"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/ocean_heat_content.py b/etl/steps/data/meadow/climate/2024-04-17/ocean_heat_content.py new file mode 100644 index 00000000000..844f5d34220 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/ocean_heat_content.py @@ -0,0 +1,75 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "ocean_heat_content_monthly_world_700m", + "ocean_heat_content_monthly_world_2000m", + "ocean_heat_content_annual_world_700m", + "ocean_heat_content_annual_world_2000m", +] + +# Columns to select from annual data, and how to rename them. +COLUMNS_ANNUAL = { + "YEAR": "date", + "WO": "ocean_heat_content", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from snapshots. + tables_monthly = [] + tables_annual = [] + for file_name in FILES: + # Extract depth and location from file name. + depth = int(file_name.split("_")[-1].replace("m", "")) + location = file_name.split("_")[-2].title() + if "monthly" in file_name: + # Read data. + new_table = paths.load_snapshot(f"{file_name}.csv").read(names=["date", "ocean_heat_content"]) + # Add columns for location and depth. + new_table = new_table.assign(**{"depth": depth, "location": location}) + # Add monthly table to list. + tables_monthly.append(new_table) + elif "annual" in file_name: + # Read data, select and rename columns. + new_table = ( + paths.load_snapshot(f"{file_name}.csv") + .read_fwf()[list(COLUMNS_ANNUAL)] + .rename(columns=COLUMNS_ANNUAL, errors="raise") + ) + # Add columns for location and depth. + new_table = new_table.assign(**{"depth": depth, "location": location}) + # Add annual table to list. + tables_annual.append(new_table) + else: + raise ValueError(f"Unexpected file name: {file_name}") + + # + # Process data. + # + # Combine monthly data and add a column for location. + tb_monthly = pr.concat(tables_monthly, short_name="ocean_heat_content_monthly") + + # Combine annual data. + tb_annual = pr.concat(tables_annual, short_name="ocean_heat_content_annual") + + # Set an appropriate index and sort conveniently. + tb_monthly = tb_monthly.set_index(["location", "depth", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "depth", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset. 
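A quick illustration of the file-name convention the meadow ocean_heat_content step relies on above: the depth (in meters) is the last underscore-separated token and the location is the one before it. A hypothetical run, mirroring the parsing in the loop:

```python
# Mirrors the parsing used in the loop above (file names as listed in FILES).
for file_name in ["ocean_heat_content_monthly_world_700m", "ocean_heat_content_annual_world_2000m"]:
    depth = int(file_name.split("_")[-1].replace("m", ""))
    location = file_name.split("_")[-2].title()
    print(file_name, "->", depth, location)
# ocean_heat_content_monthly_world_700m -> 700 World
# ocean_heat_content_annual_world_2000m -> 2000 World
```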
+ ds_meadow = create_dataset(dest_dir, tables=[tb_annual, tb_monthly], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/sea_ice_index.py b/etl/steps/data/meadow/climate/2024-04-17/sea_ice_index.py new file mode 100644 index 00000000000..d4ded1a7859 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/sea_ice_index.py @@ -0,0 +1,51 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("sea_ice_index.xlsx") + + # Read data from snapshot. + data = snap.ExcelFile() + + # + # Process data. + # + # Load sheet of northern hemisphere sea ice extent. + tb_nh = data.parse("NH-Extent").assign(**{"location": "Northern Hemisphere"}) + tb_sh = data.parse("SH-Extent").assign(**{"location": "Southern Hemisphere"}) + + # Sanity check. + assert tb_nh.iloc[0, 0] == 1978, "First cell in NH spreadsheet was expected to be 1978. Data has changed." + assert tb_sh.iloc[0, 0] == 1978, "First cell in SH spreadsheet was expected to be 1978. Data has changed." + + # Concatenate both tables. + tb = pr.concat([tb_sh, tb_nh], ignore_index=True, short_name=paths.short_name) + + # Fix column names. + tb = tb.rename(columns={tb.columns[0]: "year"}) + + # Drop empty rows and columns. + tb = tb.dropna(how="all").dropna(axis=1, how="all").reset_index(drop=True) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/sea_surface_temperature.py b/etl/steps/data/meadow/climate/2024-04-17/sea_surface_temperature.py new file mode 100644 index 00000000000..50623be8b7a --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/sea_surface_temperature.py @@ -0,0 +1,49 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "sea_surface_temperature_world", + "sea_surface_temperature_northern_hemisphere", + "sea_surface_temperature_southern_hemisphere", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from each of the snapshots, and add a column with the region name. + tables = [ + paths.load_snapshot(f"{file_name}.csv") + .read() + .assign(**{"location": file_name.split("sea_surface_temperature_")[-1].replace("_", " ").title()}) + for file_name in FILES + ] + + # + # Process data. + # + # Concatenate all tables. + tb = pr.concat(tables) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year", "month"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. 
+ ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/snow_cover_extent.py b/etl/steps/data/meadow/climate/2024-04-17/snow_cover_extent.py new file mode 100644 index 00000000000..86e0d707a8b --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/snow_cover_extent.py @@ -0,0 +1,50 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "snow_cover_extent_north_america", + "snow_cover_extent_northern_hemisphere", +] + +# Names of columns in the data. +COLUMNS = ["year", "month", "snow_cover_extent"] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load snapshot and read its data. + tables = [] + for file_name in FILES: + tb = paths.load_snapshot(f"{file_name}.csv").read_fwf(names=COLUMNS) + # Add a column for location. + tb["location"] = file_name.split("snow_cover_extent_")[-1].replace("_", " ").title() + # Add table to list. + tables.append(tb) + + # + # Process data. + # + # Combine data from all tables. + tb = pr.concat(tables) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year", "month"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Update table name. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/climate/2024-04-17/surface_temperature_analysis.py b/etl/steps/data/meadow/climate/2024-04-17/surface_temperature_analysis.py new file mode 100644 index 00000000000..88791a644b7 --- /dev/null +++ b/etl/steps/data/meadow/climate/2024-04-17/surface_temperature_analysis.py @@ -0,0 +1,62 @@ +"""Load a snapshot and create a meadow dataset.""" + + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "surface_temperature_analysis_world", + "surface_temperature_analysis_northern_hemisphere", + "surface_temperature_analysis_southern_hemisphere", +] + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Initialize dictionary to store raw tables. + tables = {} + for file_name in FILES: + # Retrieve snapshot. + snap = paths.load_snapshot(f"{file_name}.csv") + + # Load data from snapshot. + tables[file_name] = snap.read( + skiprows=1, + na_values="***", + usecols=[ + "Year", + "Jan", + "Feb", + "Mar", + "Apr", + "May", + "Jun", + "Jul", + "Aug", + "Sep", + "Oct", + "Nov", + "Dec", + ], + ) + + # + # Process data. + # + for file_name, tb in tables.items(): + # Set an appropriate index and sort conveniently. + tables[file_name] = tb.set_index(["Year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=tables.values(), check_variables_metadata=True) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() diff --git a/etl/steps/data/meadow/epa/2024-04-17/ghg_concentration.py b/etl/steps/data/meadow/epa/2024-04-17/ghg_concentration.py new file mode 100644 index 00000000000..0d60fd23ba1 --- /dev/null +++ b/etl/steps/data/meadow/epa/2024-04-17/ghg_concentration.py @@ -0,0 +1,69 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from snapshots. + snap_co2 = paths.load_snapshot("co2_concentration.csv") + snap_ch4 = paths.load_snapshot("ch4_concentration.csv") + snap_n2o = paths.load_snapshot("n2o_concentration.csv") + tb_co2 = snap_co2.read(skiprows=6) + tb_ch4 = snap_ch4.read(skiprows=6) + tb_n2o = snap_n2o.read(skiprows=6) + + # + # Process data. + # + # Remove first row, which simply says "Ice core measurements". + assert tb_co2.iloc[0, 0] == "Ice core measurements" + assert tb_ch4.iloc[0, 0] == "Ice core measurements" + assert tb_n2o.iloc[0, 0] == "Ice core measurements" + tb_co2 = tb_co2.iloc[1:].reset_index(drop=True) + tb_ch4 = tb_ch4.iloc[1:].reset_index(drop=True) + tb_n2o = tb_n2o.iloc[1:].reset_index(drop=True) + + # For convenience, rename year column. + tb_co2 = tb_co2.rename(columns={"Year": "year"}, errors="raise") + tb_ch4 = tb_ch4.rename(columns={"Year (negative values = BC)": "year"}, errors="raise") + tb_n2o = tb_n2o.rename(columns={"Year (negative values = BC)": "year"}, errors="raise") + + # Remove row that contains just the text "Direct measurements". + tb_co2 = tb_co2[tb_co2["year"] != "Direct measurements"].reset_index(drop=True) + tb_ch4 = tb_ch4[tb_ch4["year"] != "Direct measurements"].reset_index(drop=True) + tb_n2o = tb_n2o[tb_n2o["year"] != "Direct measurements"].reset_index(drop=True) + + # Convert year column to a float. + tb_co2["year"] = tb_co2["year"].astype(float) + tb_ch4["year"] = tb_ch4["year"].astype(float) + tb_n2o["year"] = tb_n2o["year"].astype(float) + + # Set an appropriate name to each table. + tb_co2.metadata.short_name = "co2_concentration" + tb_ch4.metadata.short_name = "ch4_concentration" + tb_n2o.metadata.short_name = "n2o_concentration" + + # Remove spurious empty row (with a repeated year 1988) in co2 concentration. + tb_co2 = tb_co2.dropna(subset=[column for column in tb_co2.columns if column != "year"], how="all").reset_index( + drop=True + ) + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb_co2 = tb_co2.underscore().set_index(["year"], verify_integrity=True).sort_index() + tb_ch4 = tb_ch4.underscore().set_index(["year"], verify_integrity=True).sort_index() + tb_n2o = tb_n2o.underscore().set_index(["year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset( + dest_dir, tables=[tb_co2, tb_ch4, tb_n2o], check_variables_metadata=True, default_metadata=snap_co2.metadata + ) + ds_meadow.save() diff --git a/etl/steps/data/meadow/epa/2024-04-17/ice_sheet_mass_balance.py b/etl/steps/data/meadow/epa/2024-04-17/ice_sheet_mass_balance.py new file mode 100644 index 00000000000..34eafb7f4a4 --- /dev/null +++ b/etl/steps/data/meadow/epa/2024-04-17/ice_sheet_mass_balance.py @@ -0,0 +1,31 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. 
+paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from snapshots. + tb = paths.load_snapshot("ice_sheet_mass_balance.csv").read(skiprows=6) + + # + # Process data. + # + # Ensure all columns are snake-case. + tb = tb.underscore() + + # Set an appropriate index and sort conveniently. + # NOTE: There are multiple rows for the same year. This will be fixed in the garden step. + tb = tb.set_index(["year"], verify_integrity=False).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/epa/2024-04-17/mass_balance_us_glaciers.py b/etl/steps/data/meadow/epa/2024-04-17/mass_balance_us_glaciers.py new file mode 100644 index 00000000000..1d25e1272bc --- /dev/null +++ b/etl/steps/data/meadow/epa/2024-04-17/mass_balance_us_glaciers.py @@ -0,0 +1,27 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from snapshots. + tb = paths.load_snapshot("mass_balance_us_glaciers.csv").read(skiprows=6) + + # + # Process data. + # + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["Year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/etl/steps/data/meadow/epa/2024-04-17/ocean_heat_content.py b/etl/steps/data/meadow/epa/2024-04-17/ocean_heat_content.py new file mode 100644 index 00000000000..18523609037 --- /dev/null +++ b/etl/steps/data/meadow/epa/2024-04-17/ocean_heat_content.py @@ -0,0 +1,63 @@ +"""Load a snapshot and create a meadow dataset.""" + +import owid.catalog.processing as pr + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Names of snapshot files to load and process. +FILES = [ + "ocean_heat_content_annual_world_700m", + "ocean_heat_content_annual_world_2000m", +] + +# Columns to select from data, and how to rename them. +COLUMNS_OCEAN_HEAT = { + "Year": "year", + # Data available for both 700m and 2000m. + "IAP": "ocean_heat_content_iap", + "MRI/JMA": "ocean_heat_content_mri", + "NOAA": "ocean_heat_content_noaa", + # Only available for 700m. + "CSIRO": "ocean_heat_content_csiro", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load data from snapshots. + tables = [] + for file_name in FILES: + # Extract depth and location from file name. + depth = int(file_name.split("_")[-1].replace("m", "")) + location = file_name.split("_")[-2].title() + # Read data, select and rename columns. + new_table = ( + paths.load_snapshot(f"{file_name}.csv") + .read(skiprows=6, encoding_errors="ignore") + .rename(columns=COLUMNS_OCEAN_HEAT, errors="ignore") + ) + # Add columns for location and depth. + new_table = new_table.assign(**{"depth": depth, "location": location}) + # Add annual table to list. + tables.append(new_table) + + # + # Process data. + # + # Combine data. + tb = pr.concat(tables, short_name=paths.short_name) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "depth", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. 
+ # + # Create a new meadow dataset. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_meadow.save() diff --git a/snapshots/climate/2024-04-17/ch4_concentration_monthly.csv.dvc b/snapshots/climate/2024-04-17/ch4_concentration_monthly.csv.dvc new file mode 100644 index 00000000000..558c6f672b5 --- /dev/null +++ b/snapshots/climate/2024-04-17/ch4_concentration_monthly.csv.dvc @@ -0,0 +1,23 @@ +meta: + origin: + producer: NOAA Global Monitoring Laboratory + title: Trends in Atmospheric Methane + description: |- + The Carbon Cycle Greenhouse Gases (CCGG) research area operates the Global Greenhouse Gas Reference Network, measuring the atmospheric distribution and trends of the three main long-term drivers of climate change, carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), as well as carbon monoxide (CO) which is an important indicator of air pollution. + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) Global Monitoring Laboratory, Boulder, Colorado, USA (https://gml.noaa.gov) - Trends in Atmospheric Methane. + + Lan, X., K.W. Thoning, and E.J. Dlugokencky: Trends in globally-averaged CH4, N2O, and SF6 determined from NOAA Global Monitoring Laboratory measurements. https://doi.org/10.15138/P8XG-AA10 + attribution: NOAA Global Monitoring Laboratory - Trends in Atmospheric Methane (2024) + attribution_short: NOAA/GML + url_main: https://gml.noaa.gov/ccgg/trends_ch4/ + url_download: https://gml.noaa.gov/webdata/ccgg/trends/ch4/ch4_mm_gl.csv + date_accessed: '2024-04-17' + date_published: '2024-04-05' + license: + name: CC BY 4.0 + url: https://gml.noaa.gov/about/disclaimer.html +outs: + - md5: ca470fcb8b5e2b37b128230f6530422f + size: 22727 + path: ch4_concentration_monthly.csv diff --git a/snapshots/climate/2024-04-17/climate_change_impacts.py b/snapshots/climate/2024-04-17/climate_change_impacts.py new file mode 100644 index 00000000000..61063ec48b8 --- /dev/null +++ b/snapshots/climate/2024-04-17/climate_change_impacts.py @@ -0,0 +1,202 @@ +"""Script to create a snapshot for each of the climate change datasets that have regular updates. + +The publication date will be automatically extracted from the source website, if possible, and otherwise it will be +assumed to be the same as the access date. These dates will be written to the metadata dvc files. + +""" + +import re +from datetime import datetime +from pathlib import Path +from typing import Optional + +import click +import requests +from bs4 import BeautifulSoup +from structlog import get_logger + +from etl.snapshot import Snapshot + +log = get_logger() + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of data files. +FILES = [ + # NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis. + # NOTE: Publication date cannot be automatically extracted. + "surface_temperature_analysis_world.csv", + "surface_temperature_analysis_northern_hemisphere.csv", + "surface_temperature_analysis_southern_hemisphere.csv", + # National Snow and Ice Data Center - Sea Ice Index. + "sea_ice_index.xlsx", + # Met Office Hadley Centre - HadSST. + "sea_surface_temperature_world.csv", + "sea_surface_temperature_northern_hemisphere.csv", + "sea_surface_temperature_southern_hemisphere.csv", + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # NOTE: Publication date cannot be automatically extracted. 
+ "ocean_heat_content_monthly_world_700m.csv", + "ocean_heat_content_monthly_world_2000m.csv", + "ocean_heat_content_annual_world_700m.csv", + "ocean_heat_content_annual_world_2000m.csv", + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series. + "hawaii_ocean_time_series.csv", + # Rutgers University Global Snow Lab - Snow Cover Extent. + # NOTE: Publication date cannot be automatically extracted. But they seem to have regular updates (even daily). + "snow_cover_extent_north_america.csv", + "snow_cover_extent_northern_hemisphere.csv", + # NOAA Global Monitoring Laboratory. + "co2_concentration_monthly.csv", + "ch4_concentration_monthly.csv", + "n2o_concentration_monthly.csv", +] + +######################################################################################################################## +# Other possible datasets to include: +# * Ocean heat content data from MRI/JMA. We have this data as part of the EPA ocean heat content compilation. +# But in the following link, they claim the data is updated every year, so it could be added to our yearly data. +# https://www.data.jma.go.jp/gmd/kaiyou/english/ohc/ohc_global_en.html +# * Rutgers University Global Snow Lab also includes snow cover extent for: +# * Eurasia: https://climate.rutgers.edu/snowcover/files/moncov.eurasia.txt +# * North America (excluding Greenland): https://climate.rutgers.edu/snowcover/files/moncov.nam.txt +# * Ice sheet mass balance from NASA EarthData. This is regularly updated, but to access it one has to manually log in. +# The data can be manually accessed from: +# https://climate.nasa.gov/vital-signs/ice-sheets/ +# By clicking on the HTTP link. This leads to a manual log in page. +# Once logged in, the data is accessible via the following link: +# https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/ANTARCTICA_MASS_TELLUS_MASCON_CRI_TIME_SERIES_RL06.1_V3/antarctica_mass_200204_202310.txt +# So, one could use this link, trying with different dates (e.g. ..._202401.txt, ..._202312.txt, ..._202311.txt), +# until the most recent file is downloaded. +# I contacted EarthData to ask if there is any way to access the latest data programmatically. +# * Global sea level from NASA. +# We could get more up-to-date data on sea levels from https://sealevel.jpl.nasa.gov/ +# but we would need to use a special library with credentials to fetch the data (and the baseline and format would +# probably be different). +######################################################################################################################## + + +def find_date_published(snap: Snapshot) -> Optional[str]: + # Extract publication date for each individual origin, if possible. + # Otherwise, assign the current access date as publication date. + if snap.path.name == "sea_ice_index.xlsx": + # * For sea_ice_index, the date_published can be found on: + # https://noaadata.apps.nsidc.org/NOAA/G02135/seaice_analysis/ + # Next to the file name (Sea_Ice_Index_Monthly_Data_by_Year_G02135_v3.0.xlsx). + + # Extract all the text in the web page. + url = "/".join(snap.metadata.origin.url_download.split("/")[:-1]) # type: ignore + response = requests.get(url) + # Parse HTML content. + soup = BeautifulSoup(response.text, "html.parser") + + # Fetch the date that is written next to the title. + for line in soup.text.split("\n"): + if "Sea_Ice_Index_Monthly_Data_by_Year" in line: + dates = re.findall(r"\d{2}-\w{3}-\d{4}", line) + if len(dates) == 1: + # Format date conveniently. 
+                    date = datetime.strptime(dates[0], "%d-%b-%Y").strftime("%Y-%m-%d")
+                    return date
+                else:
+                    log.warn(f"Failed to extract date_published for: {snap.path.name}")
+
+    elif snap.path.name.startswith("sea_surface_temperature_"):
+        # * For sea_surface_temperature_* the date_published can be found on:
+        #   https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html
+
+        # Extract all the text in the web page.
+        url = snap.metadata.origin.url_download.split("/data/")[0] + "/data/download.html"  # type: ignore
+        response = requests.get(url)
+        # Parse HTML content.
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        for line in soup.text.split("\n"):
+            # At the bottom of the page, there is a line like "Last updated: 09/01/2024 Expires: 09/01/2025".
+            if "Last updated" in line:
+                dates = re.findall(r"\d{2}/\d{2}/\d{4}", line)
+                if len(dates) == 2:
+                    # Format date conveniently.
+                    date = datetime.strptime(dates[0], "%d/%m/%Y").strftime("%Y-%m-%d")
+                    return date
+                else:
+                    log.warn(f"Failed to extract date_published for: {snap.path.name}")
+
+    elif snap.path.name == "hawaii_ocean_time_series.csv":
+        # * For the Hawaii Ocean Time-Series, the date_published can be found written on the header of the data itself:
+        #   https://hahana.soest.hawaii.edu/hot/hotco2/HOT_surface_CO2.txt
+
+        # Extract text from data file.
+        url = snap.metadata.origin.url_download  # type: ignore
+        response = requests.get(url)
+        for line in response.text.split("\n"):
+            # At the top of the file, there is a line like "Last updated 11 December 2023 by J.E. Dore".
+            if "Last updated" in line:
+                # Regular expression to extract the date
+                dates = re.findall(r"\d{1,2}\s+\w+\s+\d{4}", line)
+                if len(dates) == 1:
+                    # Format date conveniently.
+                    date = datetime.strptime(dates[0], "%d %B %Y").strftime("%Y-%m-%d")
+                    return date
+                else:
+                    log.warn(f"Failed to extract date_published for: {snap.path.name}")
+
+    elif "_concentration" in snap.path.name:
+        # * For NOAA GML concentration data, the date_published can be found in the header of each data file.
+        #   The date is in a line like "# File Creation: Fri Jan 5 03:55:24 2024".
+
+        # Extract text from data file.
+        url = snap.metadata.origin.url_download  # type: ignore
+        response = requests.get(url)
+        for line in response.text.split("\n"):
+            # At the top of the file, there is a line like "# File Creation: Fri Jan 5 03:55:24 2024".
+            if "File Creation" in line:
+                # Regular expression to extract the date
+                dates = re.findall(r"\w{3}\s\w{3}\s+\d{1,2}\s\d{2}:\d{2}:\d{2}\s\d{4}", line)
+                if len(dates) == 1:
+                    # Format date conveniently.
+                    date = datetime.strptime(dates[0], "%a %b %d %H:%M:%S %Y").strftime("%Y-%m-%d")
+                    return date
+                else:
+                    log.warn(f"Failed to extract date_published for: {snap.path.name}")
+
+    # In all other cases, assume date_published is the same as date_accessed.
+    return snap.metadata.origin.date_accessed  # type: ignore
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create new snapshot metadata dvc files for each of the data files.
+    for file_name in FILES:
+        snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/{file_name}")
+
+        # To ease the recurrent task update, fetch the access date from the version, and write it to the dvc files.
+        snap.metadata.origin.date_accessed = SNAPSHOT_VERSION  # type: ignore
+
+        # Extract publication date, if possible, and otherwise assume it is the same as the access date.
+ snap.metadata.origin.date_published = find_date_published(snap=snap) # type: ignore + + # Extract publication year from date_published (which will be used in the custom attribution). + year_published = snap.metadata.origin.date_published.split("-")[0] # type: ignore + + # Assign a custom attribution. + snap.metadata.origin.attribution = ( # type: ignore + f"{snap.metadata.origin.producer} - {snap.metadata.origin.title} ({year_published})" # type: ignore + ) + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Create the actual snapshots, download the data and upload them to S3. + # NOTE: This cannot be done as part of the previous loop because, if the folder of dvc files has been manually + # duplicated (without manually removing the "outs" section), `create_snapshot` will fail because there are multiple + # files with the same "outs". Therefore, we first clean the dvc files, and then run `create_snapshot`. + for file_name in FILES: + snap = Snapshot(f"climate/{SNAPSHOT_VERSION}/{file_name}") + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/climate/2024-04-17/co2_concentration_monthly.csv.dvc b/snapshots/climate/2024-04-17/co2_concentration_monthly.csv.dvc new file mode 100644 index 00000000000..bf751f6a143 --- /dev/null +++ b/snapshots/climate/2024-04-17/co2_concentration_monthly.csv.dvc @@ -0,0 +1,23 @@ +meta: + origin: + producer: NOAA Global Monitoring Laboratory + title: Trends in Atmospheric Carbon Dioxide + description: |- + The Carbon Cycle Greenhouse Gases (CCGG) research area operates the Global Greenhouse Gas Reference Network, measuring the atmospheric distribution and trends of the three main long-term drivers of climate change, carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), as well as carbon monoxide (CO) which is an important indicator of air pollution. + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) Global Monitoring Laboratory, Boulder, Colorado, USA (https://gml.noaa.gov) - Trends in Atmospheric Carbon Dioxide. + + Lan, X., Tans, P. and K.W. Thoning: Trends in globally-averaged CO2 determined from NOAA Global Monitoring Laboratory measurements. https://doi.org/10.15138/9N0H-ZH07 + attribution: NOAA Global Monitoring Laboratory - Trends in Atmospheric Carbon Dioxide (2024) + attribution_short: NOAA/GML + url_main: https://gml.noaa.gov/ccgg/trends/gl_data.html + url_download: https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_gl.csv + date_accessed: '2024-04-17' + date_published: '2024-04-05' + license: + name: CC BY 4.0 + url: https://gml.noaa.gov/about/disclaimer.html +outs: + - md5: 199c8b9b8760b4330ff337b3e167bc86 + size: 23557 + path: co2_concentration_monthly.csv diff --git a/snapshots/climate/2024-04-17/hawaii_ocean_time_series.csv.dvc b/snapshots/climate/2024-04-17/hawaii_ocean_time_series.csv.dvc new file mode 100644 index 00000000000..28f5a5def38 --- /dev/null +++ b/snapshots/climate/2024-04-17/hawaii_ocean_time_series.csv.dvc @@ -0,0 +1,25 @@ +meta: + origin: + producer: School of Ocean & Earth Science & Technology + title: Hawaii Ocean Time-series + citation_full: |- + School of Ocean and Earth Science and Technology at the University of Hawai'i at Manoa - Hawaii Ocean Time-series (HOT). + + Dore, J.E., R. Lukas, D.W. Sadler, M.J. Church, and D.M. Karl. 2009. Physical and biogeochemical modulation of ocean acidification in the central North Pacific. Proc Natl Acad Sci USA 106:12235-12240. 
+ + HOT observations are supported by the U.S. National Science Foundation under Award #1756517. + + More details can be found at [the HOT Carbon Dioxide page](https://hahana.soest.hawaii.edu/hot/hotco2/hotco2.html), specifically in [this technical document](https://hahana.soest.hawaii.edu/hot/hotco2/HOT_surface_CO2_readme.pdf). + attribution: School of Ocean & Earth Science & Technology - Hawaii Ocean Time-series (2023) + attribution_short: SOEST/Hawaii + url_main: https://hahana.soest.hawaii.edu/hot/ + url_download: https://hahana.soest.hawaii.edu/hot/hotco2/HOT_surface_CO2.txt + date_accessed: '2024-04-17' + date_published: '2023-12-11' + license: + name: Public domain + url: https://hahana.soest.hawaii.edu/hot/dataaccess.html +outs: + - md5: fd502d28aa85a6f241e9507d85b8ca8b + size: 44820 + path: hawaii_ocean_time_series.csv diff --git a/snapshots/climate/2024-04-17/n2o_concentration_monthly.csv.dvc b/snapshots/climate/2024-04-17/n2o_concentration_monthly.csv.dvc new file mode 100644 index 00000000000..2babffd8562 --- /dev/null +++ b/snapshots/climate/2024-04-17/n2o_concentration_monthly.csv.dvc @@ -0,0 +1,23 @@ +meta: + origin: + producer: NOAA Global Monitoring Laboratory + title: Trends in Atmospheric Nitrous Oxide + description: |- + The Carbon Cycle Greenhouse Gases (CCGG) research area operates the Global Greenhouse Gas Reference Network, measuring the atmospheric distribution and trends of the three main long-term drivers of climate change, carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), as well as carbon monoxide (CO) which is an important indicator of air pollution. + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) Global Monitoring Laboratory, Boulder, Colorado, USA (https://gml.noaa.gov) - Trends in Atmospheric Nitrous Oxide. + + Lan, X., K.W. Thoning, and E.J. Dlugokencky: Trends in globally-averaged CH4, N2O, and SF6 determined from NOAA Global Monitoring Laboratory measurements. https://doi.org/10.15138/P8XG-AA10 + attribution: NOAA Global Monitoring Laboratory - Trends in Atmospheric Nitrous Oxide (2024) + attribution_short: NOAA/GML + url_main: https://gml.noaa.gov/ccgg/trends_n2o/ + url_download: https://gml.noaa.gov/webdata/ccgg/trends/n2o/n2o_mm_gl.csv + date_accessed: '2024-04-17' + date_published: '2024-04-05' + license: + name: CC BY 4.0 + url: https://gml.noaa.gov/about/disclaimer.html +outs: + - md5: f4700ba6a658afc72b462ecdf81eb3b0 + size: 13301 + path: n2o_concentration_monthly.csv diff --git a/snapshots/climate/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc b/snapshots/climate/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc new file mode 100644 index 00000000000..df9d773a1df --- /dev/null +++ b/snapshots/climate/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc @@ -0,0 +1,27 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Basin Time Series + description: |- + The time series of yearly heat content are presented for the 0-700 and 0-2000 meters layers. + + The yearly data for each of four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data.html). 
+ title_snapshot: Heat Content Basin Time Series - World 0 to 2000 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Monthly Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. + attribution: NOAA National Centers for Environmental Information - Heat Content Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/yearly/h22-w0-2000m.dat + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: af13e73414f3cde4a2326156cf385d35 + size: 1140 + path: ocean_heat_content_annual_world_2000m.csv diff --git a/snapshots/climate/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc b/snapshots/climate/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc new file mode 100644 index 00000000000..89399d402fa --- /dev/null +++ b/snapshots/climate/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc @@ -0,0 +1,27 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Basin Time Series + description: |- + The time series of yearly heat content are presented for the 0-700 and 0-2000 meters layers. + + The yearly data for each of four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data.html). + title_snapshot: Heat Content Basin Time Series - World 0 to 700 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Monthly Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. 
+ attribution: NOAA National Centers for Environmental Information - Heat Content Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/yearly/h22-w0-700m.dat + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: ef1fff5b0e82b86383acb1e455ea00e5 + size: 3990 + path: ocean_heat_content_annual_world_700m.csv diff --git a/snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_2000m.csv.dvc b/snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_2000m.csv.dvc new file mode 100644 index 00000000000..3c9c86c0992 --- /dev/null +++ b/snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_2000m.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Monthly Basin Time Series + description: |- + The time series of monthly heat content are presented for the 0-700 and 0-2000 meters layers. + + The monthly data for each of the four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data_monthly.html). + title_snapshot: Heat Content Monthly Basin Time Series - World 0 to 2000 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Monthly Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. 
+ attribution: NOAA National Centers for Environmental Information - Heat Content Monthly Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: |- + https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/onemonth/ohc2000m_levitus_climdash_monthly.csv + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: 213b26eae75df619c734b0b65db8a105 + size: 3929 + path: ocean_heat_content_monthly_world_2000m.csv diff --git a/snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_700m.csv.dvc b/snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_700m.csv.dvc new file mode 100644 index 00000000000..4c05da77f2c --- /dev/null +++ b/snapshots/climate/2024-04-17/ocean_heat_content_monthly_world_700m.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: NOAA National Centers for Environmental Information + title: Heat Content Monthly Basin Time Series + description: |- + The time series of monthly heat content are presented for the 0-700 and 0-2000 meters layers. + + The monthly data for each of the four major oceanic basins, namely the World Ocean, the Pacific Ocean, the Atlantic Ocean (which includes the entire Arctic Ocean), and the Indian Ocean, can be accessed on [this page](https://www.ncei.noaa.gov/access/global-ocean-heat-content/basin_heat_data_monthly.html). + title_snapshot: Heat Content Monthly Basin Time Series - World 0 to 700 meters + citation_full: |- + National Oceanic and Atmospheric Administration (NOAA) National Centers for Environmental Information (NCEI) - Heat Content Monthly Basin Time Series. + + Levitus, Sydney; Antonov, John I.; Boyer, Tim P.; Baranova, Olga K.; García, Hernán E.; Locarnini, Ricardo A.; Mishonov, Alexey V.; Reagan, James R.; Seidov, Dan; Yarosh, Evgeney; Zweng, Melissa M. (2017). NCEI ocean heat content, temperature anomalies, salinity anomalies, thermosteric sea level anomalies, halosteric sea level anomalies, and total steric sea level anomalies from 1955 to present calculated from in situ oceanographic subsurface profile data (NCEI Accession 0164586). https://doi.org/10.7289/v53f4mvp. + attribution: NOAA National Centers for Environmental Information - Heat Content Monthly Basin Time Series (2024) + attribution_short: NOAA/NCEI + url_main: https://www.ncei.noaa.gov/products/ocean-heat-salt-sea-level + url_download: |- + https://www.ncei.noaa.gov/data/oceans/woa/DATA_ANALYSIS/3M_HEAT_CONTENT/DATA/basin/onemonth/ohc_levitus_climdash_monthly.csv + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 + url: |- + https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2023-12/NCEI%20PD-10-2-02%20-%20Open%20Data%20Policy%20Signed.pdf +outs: + - md5: 408affa84dbb88e83601a0f598314092 + size: 3882 + path: ocean_heat_content_monthly_world_700m.csv diff --git a/snapshots/climate/2024-04-17/sea_ice_index.xlsx.dvc b/snapshots/climate/2024-04-17/sea_ice_index.xlsx.dvc new file mode 100644 index 00000000000..8504ef7648b --- /dev/null +++ b/snapshots/climate/2024-04-17/sea_ice_index.xlsx.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: National Snow and Ice Data Center + title: Sea Ice Index + citation_full: |- + Fetterer, F., K. Knowles, W. N. Meier, M. Savoie, and A. K. Windnagel. (2017). Sea Ice Index, Version 3 [Data Set]. 
Boulder, Colorado USA. National Snow and Ice Data Center. https://doi.org/10.7265/N5K072F8. + attribution: National Snow and Ice Data Center - Sea Ice Index (2024) + attribution_short: NSIDC + version_producer: Version 3 + url_main: https://nsidc.org/data/g02135/ + url_download: https://noaadata.apps.nsidc.org/NOAA/G02135/seaice_analysis/Sea_Ice_Index_Monthly_Data_by_Year_G02135_v3.0.xlsx + date_accessed: '2024-04-17' + date_published: '2024-04-16' + license: + name: CC BY 4.0 +outs: + - md5: 7789e0f09255f745fa15e726569287fc + size: 25191 + path: sea_ice_index.xlsx diff --git a/snapshots/climate/2024-04-17/sea_surface_temperature_northern_hemisphere.csv.dvc b/snapshots/climate/2024-04-17/sea_surface_temperature_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..c358137828e --- /dev/null +++ b/snapshots/climate/2024-04-17/sea_surface_temperature_northern_hemisphere.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: Hadley Centre's Sea Surface Temperature (HadSST) + title_snapshot: Hadley Centre's Sea Surface Temperature (HadSST) - Northern hemisphere + citation_full: |- + Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST). + + Kennedy, J. J., Rayner, N. A., Atkinson, C. P., & Killick, R. + E. (2019). An ensemble data set of sea-surface temperature change from 1850: + the Met Office Hadley Centre HadSST.4.0.0.0 data set. Journal of Geophysical + Research: Atmospheres, 124. https://doi.org/10.1029/2018JD029867 + attribution: Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST) (2024) + attribution_short: Met Office + version_producer: 4.0.1.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadsst4/ + url_download: https://www.metoffice.gov.uk/hadobs/hadsst4/data/csv/HadSST.4.0.1.0_monthly_NHEM.csv + date_accessed: '2024-04-17' + date_published: '2024-02-19' + license: + name: Open Government Licence v3 + url: https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html +outs: + - md5: bebd99188dcd69a62a9264cde07274fc + size: 152748 + path: sea_surface_temperature_northern_hemisphere.csv diff --git a/snapshots/climate/2024-04-17/sea_surface_temperature_southern_hemisphere.csv.dvc b/snapshots/climate/2024-04-17/sea_surface_temperature_southern_hemisphere.csv.dvc new file mode 100644 index 00000000000..a72ec7d2896 --- /dev/null +++ b/snapshots/climate/2024-04-17/sea_surface_temperature_southern_hemisphere.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: Hadley Centre's Sea Surface Temperature (HadSST) + title_snapshot: Hadley Centre's Sea Surface Temperature (HadSST) - Southern hemisphere + citation_full: |- + Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST). + + Kennedy, J. J., Rayner, N. A., Atkinson, C. P., & Killick, R. + E. (2019). An ensemble data set of sea-surface temperature change from 1850: + the Met Office Hadley Centre HadSST.4.0.0.0 data set. Journal of Geophysical + Research: Atmospheres, 124. 
https://doi.org/10.1029/2018JD029867 + attribution: Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST) (2024) + attribution_short: Met Office + version_producer: 4.0.1.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadsst4/ + url_download: https://www.metoffice.gov.uk/hadobs/hadsst4/data/csv/HadSST.4.0.1.0_monthly_SHEM.csv + date_accessed: '2024-04-17' + date_published: '2024-02-19' + license: + name: Open Government Licence v3 + url: https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html +outs: + - md5: fdddc1523485ea4688c1c38fe067353b + size: 153344 + path: sea_surface_temperature_southern_hemisphere.csv diff --git a/snapshots/climate/2024-04-17/sea_surface_temperature_world.csv.dvc b/snapshots/climate/2024-04-17/sea_surface_temperature_world.csv.dvc new file mode 100644 index 00000000000..1e91922d6d9 --- /dev/null +++ b/snapshots/climate/2024-04-17/sea_surface_temperature_world.csv.dvc @@ -0,0 +1,26 @@ +meta: + origin: + producer: Met Office Hadley Centre + title: Hadley Centre's Sea Surface Temperature (HadSST) + title_snapshot: Hadley Centre's Sea Surface Temperature (HadSST) - World + citation_full: |- + Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST). + + Kennedy, J. J., Rayner, N. A., Atkinson, C. P., & Killick, R. + E. (2019). An ensemble data set of sea-surface temperature change from 1850: + the Met Office Hadley Centre HadSST.4.0.0.0 data set. Journal of Geophysical + Research: Atmospheres, 124. https://doi.org/10.1029/2018JD029867 + attribution: Met Office Hadley Centre - Hadley Centre's Sea Surface Temperature (HadSST) (2024) + attribution_short: Met Office + version_producer: 4.0.1.0 + url_main: https://www.metoffice.gov.uk/hadobs/hadsst4/ + url_download: https://www.metoffice.gov.uk/hadobs/hadsst4/data/csv/HadSST.4.0.1.0_monthly_GLOBE.csv + date_accessed: '2024-04-17' + date_published: '2024-02-19' + license: + name: Open Government Licence v3 + url: https://www.metoffice.gov.uk/hadobs/hadsst4/data/download.html +outs: + - md5: 8d5d830f7b14ec99e377527e60d408fd + size: 153237 + path: sea_surface_temperature_world.csv diff --git a/snapshots/climate/2024-04-17/snow_cover_extent_north_america.csv.dvc b/snapshots/climate/2024-04-17/snow_cover_extent_north_america.csv.dvc new file mode 100644 index 00000000000..76888e4302d --- /dev/null +++ b/snapshots/climate/2024-04-17/snow_cover_extent_north_america.csv.dvc @@ -0,0 +1,22 @@ +meta: + origin: + producer: Rutgers University Global Snow Lab + title: Snow Cover Extent + title_snapshot: Area of Snow Extent - North America (including Greenland) + citation_full: |- + Rutgers University Global Snow Lab - Area of Snow Extent. + + Robinson, David A., Estilow, Thomas W., and NOAA CDR Program (2012): NOAA Climate Data Record (CDR) of Northern Hemisphere (NH) Snow Cover Extent (SCE), Version 1. NOAA National Centers for Environmental Information. 
doi: 10.7289/V5N014G9 + attribution: Rutgers University Global Snow Lab - Snow Cover Extent (2024) + attribution_short: Rutgers + version_producer: Version 1 + url_main: https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + url_download: https://climate.rutgers.edu/snowcover/files/moncov.namgnld.txt + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 +outs: + - md5: b9a1c33040d06e34e40a50f560955f1c + size: 12599 + path: snow_cover_extent_north_america.csv diff --git a/snapshots/climate/2024-04-17/snow_cover_extent_northern_hemisphere.csv.dvc b/snapshots/climate/2024-04-17/snow_cover_extent_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..2a2eb18fe23 --- /dev/null +++ b/snapshots/climate/2024-04-17/snow_cover_extent_northern_hemisphere.csv.dvc @@ -0,0 +1,22 @@ +meta: + origin: + producer: Rutgers University Global Snow Lab + title: Snow Cover Extent + title_snapshot: Area of Snow Extent - Northern Hemisphere + citation_full: |- + Rutgers University Global Snow Lab - Area of Snow Extent. + + Robinson, David A., Estilow, Thomas W., and NOAA CDR Program (2012): NOAA Climate Data Record (CDR) of Northern Hemisphere (NH) Snow Cover Extent (SCE), Version 1. NOAA National Centers for Environmental Information. doi: 10.7289/V5N014G9 + attribution: Rutgers University Global Snow Lab - Snow Cover Extent (2024) + attribution_short: Rutgers + version_producer: Version 1 + url_main: https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + url_download: https://climate.rutgers.edu/snowcover/files/moncov.nhland.txt + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 +outs: + - md5: f28936a4230c681bb9dbfad61d7f50a4 + size: 12721 + path: snow_cover_extent_northern_hemisphere.csv diff --git a/snapshots/climate/2024-04-17/surface_temperature_analysis_northern_hemisphere.csv.dvc b/snapshots/climate/2024-04-17/surface_temperature_analysis_northern_hemisphere.csv.dvc new file mode 100644 index 00000000000..aaa334aaf07 --- /dev/null +++ b/snapshots/climate/2024-04-17/surface_temperature_analysis_northern_hemisphere.csv.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: NASA Goddard Institute for Space Studies + title: GISS Surface Temperature Analysis + title_snapshot: GISS Surface Temperature Analysis - Northern hemisphere + citation_full: NASA. GISS Surface Temperature Analysis (GISTEMP v4) + attribution: NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis (2024) + attribution_short: NASA + version_producer: v4 + url_main: https://data.giss.nasa.gov/gistemp/ + url_download: https://data.giss.nasa.gov/gistemp/tabledata_v4/NH.Ts+dSST.csv + date_accessed: '2024-04-17' + date_published: '2024-04-17' + license: + name: CC BY 4.0 +outs: + - md5: 85e695cd4d096248d04dcbfcff02f1a3 + size: 12731 + path: surface_temperature_analysis_northern_hemisphere.csv diff --git a/snapshots/climate/2024-04-17/surface_temperature_analysis_southern_hemisphere.csv.dvc b/snapshots/climate/2024-04-17/surface_temperature_analysis_southern_hemisphere.csv.dvc new file mode 100644 index 00000000000..ff3335c7178 --- /dev/null +++ b/snapshots/climate/2024-04-17/surface_temperature_analysis_southern_hemisphere.csv.dvc @@ -0,0 +1,19 @@ +meta: + origin: + producer: NASA Goddard Institute for Space Studies + title: GISS Surface Temperature Analysis + title_snapshot: GISS Surface Temperature Analysis - Southern hemisphere + citation_full: NASA. 
GISS Surface Temperature Analysis (GISTEMP v4)
+    attribution: NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis (2024)
+    attribution_short: NASA
+    version_producer: v4
+    url_main: https://data.giss.nasa.gov/gistemp/
+    url_download: https://data.giss.nasa.gov/gistemp/tabledata_v4/SH.Ts+dSST.csv
+    date_accessed: '2024-04-17'
+    date_published: '2024-04-17'
+    license:
+      name: CC BY 4.0
+outs:
+  - md5: 0e3bc94e41096b0dcd8aee8b8cd59eed
+    size: 12711
+    path: surface_temperature_analysis_southern_hemisphere.csv
diff --git a/snapshots/climate/2024-04-17/surface_temperature_analysis_world.csv.dvc b/snapshots/climate/2024-04-17/surface_temperature_analysis_world.csv.dvc
new file mode 100644
index 00000000000..5da8d0b0614
--- /dev/null
+++ b/snapshots/climate/2024-04-17/surface_temperature_analysis_world.csv.dvc
@@ -0,0 +1,19 @@
+meta:
+  origin:
+    producer: NASA Goddard Institute for Space Studies
+    title: GISS Surface Temperature Analysis
+    title_snapshot: GISS Surface Temperature Analysis - World
+    citation_full: NASA. GISS Surface Temperature Analysis (GISTEMP v4)
+    attribution: NASA Goddard Institute for Space Studies - GISS Surface Temperature Analysis (2024)
+    attribution_short: NASA
+    version_producer: v4
+    url_main: https://data.giss.nasa.gov/gistemp/
+    url_download: https://data.giss.nasa.gov/gistemp/tabledata_v4/GLB.Ts+dSST.csv
+    date_accessed: '2024-04-17'
+    date_published: '2024-04-17'
+    license:
+      name: CC BY 4.0
+outs:
+  - md5: 8b4ac958dda08f468f580a796dbe5b1c
+    size: 12690
+    path: surface_temperature_analysis_world.csv
diff --git a/snapshots/epa/2024-04-17/ch4_concentration.csv.dvc b/snapshots/epa/2024-04-17/ch4_concentration.csv.dvc
new file mode 100644
index 00000000000..29dfa5c99d4
--- /dev/null
+++ b/snapshots/epa/2024-04-17/ch4_concentration.csv.dvc
@@ -0,0 +1,29 @@
+meta:
+  origin:
+    producer: United States Environmental Protection Agency
+    title: 'Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases'
+    description: |-
+      This indicator describes how the levels of major greenhouse gases in the atmosphere have changed over time.
+
+      The data contains concentrations of methane in the atmosphere from hundreds of thousands of years ago through 2021, measured in parts per billion (ppb). The data come from a variety of historical ice core studies and recent air monitoring sites around the world.
+    title_snapshot: 'Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases - Methane'
+    citation_full: |-
+      United States Environmental Protection Agency (EPA) - Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases (2022)
+
+      Global atmospheric concentration measurements for carbon dioxide, methane, and nitrous oxide come from a variety of monitoring programs and studies published in peer-reviewed literature.
+
+      More details can be found on [their technical documentation](https://www.epa.gov/sites/default/files/2021-03/documents/ghg-concentrations_td.pdf).
+ attribution: EPA based on various sources (2022) + attribution_short: EPA + url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-atmospheric-concentrations-greenhouse-gases + url_download: https://www.epa.gov/system/files/other-files/2022-07/ghg-concentrations_fig-2.csv + date_accessed: '2024-04-17' + date_published: '2022-07-01' + license: + name: Public domain + url: https://edg.epa.gov/epa_data_license.html +wdir: ../../../data/snapshots/epa/2024-04-17 +outs: + - md5: e960453c271261622b4bfe2b2131755e + size: 48396 + path: ch4_concentration.csv diff --git a/snapshots/epa/2024-04-17/climate_change_indicators.py b/snapshots/epa/2024-04-17/climate_change_indicators.py new file mode 100644 index 00000000000..6943a206f28 --- /dev/null +++ b/snapshots/epa/2024-04-17/climate_change_indicators.py @@ -0,0 +1,43 @@ +"""Script to create snapshots of EPA compilations of different climate change indicators. + +The main page is https://www.epa.gov/climate-indicators/view-indicators +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of data files. +FILES = [ + # Ocean heat content. + "ocean_heat_content_annual_world_700m.csv", + "ocean_heat_content_annual_world_2000m.csv", + # Ice sheet mass balance. + "ice_sheet_mass_balance.csv", + # Greenhouse gas concentration. + "co2_concentration.csv", + "ch4_concentration.csv", + "n2o_concentration.csv", + # Cumulative mass balance of US glaciers. + "mass_balance_us_glaciers.csv", +] + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot for each of the data files. + for file_name in FILES: + snap = Snapshot(f"epa/{SNAPSHOT_VERSION}/{file_name}") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() diff --git a/snapshots/epa/2024-04-17/co2_concentration.csv.dvc b/snapshots/epa/2024-04-17/co2_concentration.csv.dvc new file mode 100644 index 00000000000..be6edad58e2 --- /dev/null +++ b/snapshots/epa/2024-04-17/co2_concentration.csv.dvc @@ -0,0 +1,29 @@ +meta: + origin: + producer: United States Environmental Protection Agency + title: 'Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases' + description: |- + This indicator describes how the levels of major greenhouse gases in the atmosphere have changed over time. + + The data contains concentrations of carbon dioxide in the atmosphere from hundreds of thousands of years ago through 2021, measured in parts per million (ppm). The data come from a variety of historical ice core studies and recent air monitoring sites around the world. + title_snapshot: 'Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases - Carbon Dioxide' + citation_full: |- + United States Environmental Protection Agency (EPA) - Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases (2022) + + Global atmospheric concentration measurements for carbon dioxide, methane, and nitrous oxide come from a variety of monitoring programs and studies published in peer-reviewed literature. + + More details can be found on [their technical documentation](https://www.epa.gov/sites/default/files/2021-03/documents/ghg-concentrations_td.pdf). 
+ attribution: EPA based on various sources (2022) + attribution_short: EPA + url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-atmospheric-concentrations-greenhouse-gases + url_download: https://www.epa.gov/system/files/other-files/2022-07/ghg-concentrations_fig-1.csv + date_accessed: '2024-04-17' + date_published: '2022-07-01' + license: + name: Public domain + url: https://edg.epa.gov/epa_data_license.html +wdir: ../../../data/snapshots/epa/2024-04-17 +outs: + - md5: c189619be8fc1b41144235271091309f + size: 59458 + path: co2_concentration.csv diff --git a/snapshots/epa/2024-04-17/ice_sheet_mass_balance.csv.dvc b/snapshots/epa/2024-04-17/ice_sheet_mass_balance.csv.dvc new file mode 100644 index 00000000000..2cf75808699 --- /dev/null +++ b/snapshots/epa/2024-04-17/ice_sheet_mass_balance.csv.dvc @@ -0,0 +1,37 @@ +meta: + origin: + producer: United States Environmental Protection Agency + title: 'Climate Change Indicators: Ice Sheets' + description: |- + This dataset includes the cumulative change in mass in the ice sheets of Greenland and Antarctica since 1992. + + This indicator is presented in two different ways, based on two different sources: + * IMBIE, an average value based on more than 20 different studies. + * The uncertainty estimates from the various datasets that feed into the combined average. + * NASA JPL: A commonly cited analysis by NASA JPL, which includes seasonal variations. + + All estimates are centered at zero in 2002 to provide a consistent point of reference. Thus, a downward slope indicates a net loss of ice and snow. + For reference, 1,000 billion metric tons is equal to about 260 cubic miles of ice—enough to raise sea level by about 3 millimeters. + citation_full: |- + United States Environmental Protection Agency (EPA) - Climate Change Indicators: Ice Sheets (2021) + + Full citation of the original sources: + * IMBIE (Ice sheet Mass Balance Inter-comparison Exercise team). 2018. Mass balance of the Antarctic Ice Sheet from 1992 to 2017. Nature 558:219-222. doi:10.1038/s41586-018-0179-y + * IMBIE (Ice sheet Mass Balance Inter-comparison Exercise team). 2020. Mass balance of the Greenland Ice sheet from 1992 to 2018. Nature in press. doi:10.1038/s41586-019-1855-2 + * NASA (National Aeronautics and Space Administration). 2021. Vital signs: Land ice. Antarctica and Greenland mass variation since 2002. Accessed March 2021. https://climate.nasa.gov/vital-signs/land-ice + + More details can be found on [their technical documentation](https://www.epa.gov/sites/default/files/2021-04/documents/ice-sheets_td.pdf). 
+    attribution: EPA based on various sources (2021)
+    attribution_short: EPA
+    url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-ice-sheets
+    url_download: https://www.epa.gov/sites/default/files/2021-04/ice_sheets_fig-1.csv
+    date_accessed: '2024-04-17'
+    date_published: '2021-04-01'
+    license:
+      name: Public domain
+      url: https://edg.epa.gov/epa_data_license.html
+wdir: ../../../data/snapshots/epa/2024-04-17
+outs:
+  - md5: 4855164ffa55ddf71df690fadceac94c
+    size: 25986
+    path: ice_sheet_mass_balance.csv
diff --git a/snapshots/epa/2024-04-17/mass_balance_us_glaciers.csv.dvc b/snapshots/epa/2024-04-17/mass_balance_us_glaciers.csv.dvc
new file mode 100644
index 00000000000..f8fe91c1617
--- /dev/null
+++ b/snapshots/epa/2024-04-17/mass_balance_us_glaciers.csv.dvc
@@ -0,0 +1,28 @@
+meta:
+  origin:
+    producer: United States Environmental Protection Agency
+    title: 'Climate Change Indicators: Glaciers'
+    description: |-
+      This dataset examines the balance between snow accumulation and melting in glaciers, and it describes how glaciers have changed over time.
+
+    title_snapshot: 'Climate Change Indicators: Glaciers - US Glaciers'
+    citation_full: |-
+      United States Environmental Protection Agency (EPA) - Climate Change Indicators: Glaciers (2021)
+
+      The underlying data comes from the United States Geological Survey (USGS). Glacier-wide mass balance and compiled data inputs: USGS benchmark glaciers (ver. 4.0, November 2019). Accessed December 2020. https://alaska.usgs.gov/products/data.php?dataid=79. doi:10.5066/F7HD7SRF.
+
+      More details can be found on [the EPA's technical documentation](https://www.epa.gov/sites/default/files/2021-03/documents/glaciers_td.pdf).
+    attribution: EPA based on various sources (2021)
+    attribution_short: EPA
+    url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-glaciers
+    url_download: https://www.epa.gov/sites/default/files/2021-03/glaciers_fig-2.csv
+    date_accessed: '2024-04-17'
+    date_published: '2021-04-01'
+    license:
+      name: Public domain
+      url: https://edg.epa.gov/epa_data_license.html
+wdir: ../../../data/snapshots/epa/2024-04-17
+outs:
+  - md5: 98571ba4a0568f2f3ab78bc2aa9c9c3f
+    size: 2257
+    path: mass_balance_us_glaciers.csv
diff --git a/snapshots/epa/2024-04-17/n2o_concentration.csv.dvc b/snapshots/epa/2024-04-17/n2o_concentration.csv.dvc
new file mode 100644
index 00000000000..b209fbf30ac
--- /dev/null
+++ b/snapshots/epa/2024-04-17/n2o_concentration.csv.dvc
@@ -0,0 +1,29 @@
+meta:
+  origin:
+    producer: United States Environmental Protection Agency
+    title: 'Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases'
+    description: |-
+      This indicator describes how the levels of major greenhouse gases in the atmosphere have changed over time.
+
+      The data contains concentrations of nitrous oxide in the atmosphere from hundreds of thousands of years ago through 2021, measured in parts per billion (ppb). The data come from a variety of historical ice core studies and recent air monitoring sites around the world.
+    title_snapshot: 'Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases - Nitrous Oxide'
+    citation_full: |-
+      United States Environmental Protection Agency (EPA) - Climate Change Indicators: Atmospheric Concentrations of Greenhouse Gases (2022)
+
+      Global atmospheric concentration measurements for carbon dioxide, methane, and nitrous oxide come from a variety of monitoring programs and studies published in peer-reviewed literature.
+ + More details can be found on [their technical documentation](https://www.epa.gov/sites/default/files/2021-03/documents/ghg-concentrations_td.pdf). + attribution: EPA based on various sources (2022) + attribution_short: EPA + url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-atmospheric-concentrations-greenhouse-gases + url_download: https://www.epa.gov/system/files/other-files/2022-07/ghg-concentrations_fig-3.csv + date_accessed: '2024-04-17' + date_published: '2022-07-01' + license: + name: Public domain + url: https://edg.epa.gov/epa_data_license.html +wdir: ../../../data/snapshots/epa/2024-04-17 +outs: + - md5: b2bcc42a26e9d7fb47bb9c107fe60f66 + size: 19470 + path: n2o_concentration.csv diff --git a/snapshots/epa/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc b/snapshots/epa/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc new file mode 100644 index 00000000000..d405db0f89c --- /dev/null +++ b/snapshots/epa/2024-04-17/ocean_heat_content_annual_world_2000m.csv.dvc @@ -0,0 +1,32 @@ +meta: + origin: + producer: United States Environmental Protection Agency + title: 'Climate Change Indicators: Ocean Heat' + title_snapshot: 'Climate Change Indicators: Ocean Heat - World 0 to 2000 meters' + citation_full: |- + United States Environmental Protection Agency (EPA) - Climate Change Indicators: Ocean Heat (2021) + + Data for this indicator were collected by the National Oceanic and Atmospheric Administration (NOAA) and other organizations around the world. The data were analyzed independently by researchers at NOAA, Australia's Commonwealth Scientific and Industrial Research Organisation (CSIRO), China's Institute of Atmospheric Physics (IAP), and the Japan Meteorological Agency's Meteorological Research Institute (MRI/JMA). + + Full citation of the original sources: + * NOAA (National Oceanic and Atmospheric Administration). 2021. Global ocean heat and salt content. Accessed February 2021. www.nodc.noaa.gov/OC5/3M_HEAT_CONTENT . Based on: + * Levitus, S., J.I. Antonov, T.P. Boyer, O.K. Baranova, H.E. Garcia, R.A. Locarnini, A.V. Mishonov, J.R. Reagan, D. Seidov, E.S. Yarosh, and M.M. Zweng. 2012. World ocean heat content and thermosteric sea level change (0-2000 m), 1955-2010. Geophys. Res. Lett. 39:L10603. https://www.ncei.noaa.gov/data/oceans/woa/PUBLICATIONS/grlheat12.pdf + * CSIRO (Commonwealth Scientific and Industrial Research Organisation). 2016 update to data originally published in: Domingues, C.M., J.A. Church, N.J. White, P.J. Gleckler, S.E. Wijffels, P.M. Barker, and J.R. Dunn. 2008. Improved estimates of upper-ocean warming and multi-decadal sea-level rise. Nature 453:1090-1094. www.cmar.csiro.au/sealevel/thermal_expansion_ocean_heat_timeseries.html + * IAP (Institute of Atmospheric Physics). 2021 update to data originally published in: Cheng, L., K.E. Trenberth, J. Fasullo, T. Boyer, J. Abraham, and J. Zhu. 2017. Improved estimates of ocean heat content from 1960 to 2015. Science Advances 3(3):e1601545. + * MRI/JMA (Meteorological Research Institute/Japan Meteorological Agency). 2021. Global ocean heat content. Accessed February 2021. www.data.jma.go.jp/gmd/kaiyou/english/ohc/ohc_global_en.html. + + More details can be found on [their technical documentation](https://www.epa.gov/sites/default/files/2021-04/documents/ocean-heat_td.pdf). 
+ attribution: EPA based on various sources (2021) + attribution_short: EPA + url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-ocean-heat + url_download: https://www.epa.gov/sites/default/files/2021-04/ocean-heat_fig-2.csv + date_accessed: '2024-04-17' + date_published: '2021-04-01' + license: + name: Public domain + url: https://edg.epa.gov/epa_data_license.html +wdir: ../../../data/snapshots/epa/2024-04-17 +outs: + - md5: 56d585ef448f394150031a8f335eb373 + size: 3152 + path: ocean_heat_content_annual_world_2000m.csv diff --git a/snapshots/epa/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc b/snapshots/epa/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc new file mode 100644 index 00000000000..676bb3dba27 --- /dev/null +++ b/snapshots/epa/2024-04-17/ocean_heat_content_annual_world_700m.csv.dvc @@ -0,0 +1,32 @@ +meta: + origin: + producer: United States Environmental Protection Agency + title: 'Climate Change Indicators: Ocean Heat' + title_snapshot: 'Climate Change Indicators: Ocean Heat - World 0 to 700 meters' + citation_full: |- + United States Environmental Protection Agency (EPA) - Climate Change Indicators: Ocean Heat (2021) + + Data for this indicator were collected by the National Oceanic and Atmospheric Administration (NOAA) and other organizations around the world. The data were analyzed independently by researchers at NOAA, Australia's Commonwealth Scientific and Industrial Research Organisation (CSIRO), China's Institute of Atmospheric Physics (IAP), and the Japan Meteorological Agency's Meteorological Research Institute (MRI/JMA). + + Full citation of the original sources: + * NOAA (National Oceanic and Atmospheric Administration). 2021. Global ocean heat and salt content. Accessed February 2021. www.nodc.noaa.gov/OC5/3M_HEAT_CONTENT . Based on: + * Levitus, S., J.I. Antonov, T.P. Boyer, O.K. Baranova, H.E. Garcia, R.A. Locarnini, A.V. Mishonov, J.R. Reagan, D. Seidov, E.S. Yarosh, and M.M. Zweng. 2012. World ocean heat content and thermosteric sea level change (0-2000 m), 1955-2010. Geophys. Res. Lett. 39:L10603. https://www.ncei.noaa.gov/data/oceans/woa/PUBLICATIONS/grlheat12.pdf + * CSIRO (Commonwealth Scientific and Industrial Research Organisation). 2016 update to data originally published in: Domingues, C.M., J.A. Church, N.J. White, P.J. Gleckler, S.E. Wijffels, P.M. Barker, and J.R. Dunn. 2008. Improved estimates of upper-ocean warming and multi-decadal sea-level rise. Nature 453:1090-1094. www.cmar.csiro.au/sealevel/thermal_expansion_ocean_heat_timeseries.html + * IAP (Institute of Atmospheric Physics). 2021 update to data originally published in: Cheng, L., K.E. Trenberth, J. Fasullo, T. Boyer, J. Abraham, and J. Zhu. 2017. Improved estimates of ocean heat content from 1960 to 2015. Science Advances 3(3):e1601545. + * MRI/JMA (Meteorological Research Institute/Japan Meteorological Agency). 2021. Global ocean heat content. Accessed February 2021. www.data.jma.go.jp/gmd/kaiyou/english/ohc/ohc_global_en.html. + + More details can be found on [their technical documentation](https://www.epa.gov/sites/default/files/2021-04/documents/ocean-heat_td.pdf). 
+ attribution: EPA based on various sources (2021) + attribution_short: EPA + url_main: https://www.epa.gov/climate-indicators/climate-change-indicators-ocean-heat + url_download: https://www.epa.gov/sites/default/files/2021-04/ocean-heat_fig-1.csv + date_accessed: '2024-04-17' + date_published: '2021-04-01' + license: + name: Public domain + url: https://edg.epa.gov/epa_data_license.html +wdir: ../../../data/snapshots/epa/2024-04-17 +outs: + - md5: 909371e229b3a4f40427659b376d5432 + size: 3919 + path: ocean_heat_content_annual_world_700m.csv From 789858cce9669fd78cd3488341eeeb91f0ef1b54 Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 17 Apr 2024 12:20:40 +0000 Subject: [PATCH 39/40] fasttrack: fasttrack/2024-04-17/qubits.csv --- .../fasttrack/2024-04-17/qubits.meta.yml | 25 ++----------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml index 40283d1b22d..c2bbb6cba34 100644 --- a/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml +++ b/etl/steps/data/grapher/fasttrack/2024-04-17/qubits.meta.yml @@ -14,32 +14,11 @@ tables: short_unit: qb display: numDecimalPlaces: 0 - description_key: - - '[' - - '"' - - t - - e - - s - - t - - '' - - '1' - - '"' - - ',' - - '' - - '"' - - t - - e - - s - - t - - '' - - '2' - - '"' - - ']' + description_short: Highest number of quantum bits in a single circuit-based quantum processor over time. description_from_producer: |- - This list contains quantum processors, also known as quantum processing units (QPUs). Some devices listed below have only been announced at press conferences so far, with no actual demonstrations or scientific publications characterizing the performance. + This list contains quantum processors, also known as quantum processing units (QPUs). Some devices listed have only been announced at press conferences so far, with no actual demonstrations or scientific publications characterizing the performance. Quantum processors are difficult to compare due to the different architectures and approaches. Due to this, published qubit numbers do not reflect the performance levels of the processor. This is instead achieved through benchmarking metrics such as quantum volume, randomized benchmarking or circuit layer operations per second (CLOPS). These QPUs are based on the quantum circuit and quantum logic gate-based model of computing. processing_level: minor - description: Highest number of quantum bits in a single circuit-based quantum processor over time From e706587bfb2bd09cbe88850368eb3a3200b7ac2f Mon Sep 17 00:00:00 2001 From: owidbot Date: Wed, 17 Apr 2024 13:00:40 +0000 Subject: [PATCH 40/40] fasttrack: fasttrack/2024-04-17/qubits.csv
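Each snapshot .dvc file added in this patch series pairs an origin block (producer, title, URLs, license, access and publication dates) with an outs entry that records the md5 checksum, size, and filename of the downloaded CSV. As a rough sketch only, not part of the repository's actual tooling, a standalone script along the following lines could check a local download against one of these files; the paths and the verify_snapshot helper are assumptions rather than existing code.

import hashlib
from pathlib import Path

import yaml  # PyYAML

# Hypothetical locations: a .dvc file from this patch and the directory
# where the corresponding CSV is assumed to have been downloaded.
DVC_FILE = Path("snapshots/epa/2024-04-17/n2o_concentration.csv.dvc")
DATA_DIR = Path("data/snapshots/epa/2024-04-17")


def verify_snapshot(dvc_file: Path, data_dir: Path) -> None:
    """Compare a local file against the md5 and size recorded in its .dvc metadata."""
    dvc = yaml.safe_load(dvc_file.read_text())
    out = dvc["outs"][0]  # each .dvc file above records a single output
    local_file = data_dir / out["path"]

    md5 = hashlib.md5(local_file.read_bytes()).hexdigest()
    if md5 != out["md5"] or local_file.stat().st_size != out["size"]:
        raise ValueError(f"{local_file} does not match the checksum or size in {dvc_file}")
    print(f"{local_file}: OK ({out['size']} bytes, md5 {out['md5']})")


if __name__ == "__main__":
    verify_snapshot(DVC_FILE, DATA_DIR)

The sketch is only meant to make the role of the outs block explicit; the pipeline itself presumably relies on its own snapshot tooling rather than a script like this.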