diff --git a/.gitignore b/.gitignore index 0e29c71e468..0b877817eda 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ site/ .wizardcfg/* .streamlit/* .ipynb_lock +.execution_time.json + diff --git a/Makefile b/Makefile index 4df3585ed72..a6255def36e 100644 --- a/Makefile +++ b/Makefile @@ -23,6 +23,7 @@ help: @echo ' make format-all Format code (including modules in lib/)' @echo ' make full Fetch all data and run full transformations' @echo ' make grapher Publish supported datasets to Grapher' + @echo ' make sync.catalog Sync catalog from R2 into local data/ folder' @echo ' make lab Start a Jupyter Lab server' @echo ' make publish Publish the generated catalog to S3' @echo ' make api Start the ETL API on port 8081' @@ -118,6 +119,14 @@ prune: .venv @echo '==> Prune datasets with no recipe from catalog' poetry run etl d prune +# Syncing catalog is useful if you want to avoid rebuilding it locally from scratch +# which could take a few hours. This will download ~10gb from the main channels +# (meadow, garden, open_numbers) and is especially useful when we increase ETL_EPOCH +# or update regions. +sync.catalog: .venv + @echo '==> Sync catalog from R2 into local data/ folder (~10gb)' + rclone copy owid-r2:owid-catalog/ data/ --verbose --fast-list --transfers=64 --checkers=64 --include "/meadow/**" --include "/garden/**" --include "/open_numbers/**" + grapher: .venv @echo '==> Running full etl with grapher upsert' poetry run etl run --grapher diff --git a/api/v1/__init__.py b/api/v1/__init__.py index a4443197f3b..a9877e90347 100644 --- a/api/v1/__init__.py +++ b/api/v1/__init__.py @@ -179,8 +179,12 @@ def _indicator_metadata_dict(indicator: Indicator, db_indicator: gm.Variable) -> indicator_update_dict = indicator.to_meta_dict() update_period_days = indicator_update_dict.pop("update_period_days", None) + # if indicator has dimensions, use its original name + original_short_name = (db_indicator.dimensions or {}).get("originalShortName") + short_name = original_short_name or db_indicator.shortName + # create dictionary for metadata - meta_dict = {"tables": {db_indicator.table_name: {"variables": {db_indicator.shortName: indicator_update_dict}}}} + meta_dict = {"tables": {db_indicator.table_name: {"variables": {short_name: indicator_update_dict}}}} if update_period_days: meta_dict["dataset"] = {"update_period_days": update_period_days} diff --git a/apps/backport/backport.py b/apps/backport/backport.py index bb0186581f4..b3fbe2c2a65 100644 --- a/apps/backport/backport.py +++ b/apps/backport/backport.py @@ -20,7 +20,7 @@ from etl import config, paths from etl import grapher_model as gm from etl.backport_helpers import GrapherConfig -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.files import checksum_str from etl.snapshot import Snapshot, SnapshotMeta @@ -346,7 +346,7 @@ def _load_values(engine: Engine, variable_ids: list[int]) -> pd.DataFrame: "entityCode": "entity_code", } ) - vf: pd.DataFrame = pd.read_sql(q, engine, params={"variable_ids": variable_ids}) + vf = read_sql(q, engine, params={"variable_ids": variable_ids}) df = df.merge(vf, on="variable_id") # try converting values to float if possible, this can make the data 50% smaller diff --git a/apps/backport/bulk_backport.py b/apps/backport/bulk_backport.py index c9d61ceacfe..bb10fd859f0 100644 --- a/apps/backport/bulk_backport.py +++ b/apps/backport/bulk_backport.py @@ -9,7 +9,7 @@ from sqlalchemy.engine import Engine from etl import config -from etl.db import get_engine +from etl.db import 
get_engine, read_sql from etl.snapshot import snapshot_catalog from etl.steps import load_dag @@ -195,7 +195,7 @@ def _active_datasets( limit %(limit)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={ diff --git a/apps/backport/datasync/data_metadata.py b/apps/backport/datasync/data_metadata.py index 190a6d32a85..f1d4fe3e725 100644 --- a/apps/backport/datasync/data_metadata.py +++ b/apps/backport/datasync/data_metadata.py @@ -83,7 +83,13 @@ def add_entity_code_and_name(session: Session, df: pd.DataFrame) -> pd.DataFrame df["entityCode"] = [] return df - entities = _fetch_entities(session, list(df["entityId"].unique())) + unique_entities = df["entityId"].unique() + + entities = _fetch_entities(session, list(unique_entities)) + + if set(unique_entities) - set(entities.entityId): + missing_entities = set(unique_entities) - set(entities.entityId) + raise ValueError(f"Missing entities in the database: {missing_entities}") return pd.merge(df, entities, on="entityId") diff --git a/apps/metadata_migrate/cli.py b/apps/metadata_migrate/cli.py index 48201ac4dcc..69aa7abba17 100644 --- a/apps/metadata_migrate/cli.py +++ b/apps/metadata_migrate/cli.py @@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional import click -import pandas as pd import structlog from owid.catalog import Dataset, DatasetMeta, License, Origin, Source, Table from rich import print @@ -16,7 +15,7 @@ from etl import config from etl import grapher_model as gm from etl.command import main as etl_main -from etl.db import get_engine +from etl.db import get_engine, read_sql from etl.metadata_export import merge_or_create_yaml, reorder_fields from etl.paths import BASE_DIR, DAG_FILE, DATA_DIR, STEP_DIR @@ -108,7 +107,7 @@ def cli( select config from charts where slug = '{chart_slug}' """ - df = pd.read_sql(q, engine) + df = read_sql(q, engine) if df.empty: raise ValueError(f"no chart found for slug {chart_slug}") @@ -359,7 +358,7 @@ def _load_grapher_config(engine: Engine, col: str, ds_meta: DatasetMeta) -> Dict d.version = '{ds_meta.version}' and d.shortName = '{ds_meta.short_name}' """ - cf = pd.read_sql(q, engine) + cf = read_sql(q, engine) if len(cf) == 0: log.warning(f"no chart found for variable {col}") return {} diff --git a/apps/owidbot/__init__.py b/apps/owidbot/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/apps/owidbot/etldiff.py b/apps/owidbot/etldiff.py new file mode 100644 index 00000000000..266bcd816a6 --- /dev/null +++ b/apps/owidbot/etldiff.py @@ -0,0 +1,195 @@ +import datetime as dt +import subprocess +import time +from typing import Tuple + +import click +import structlog +from github import Auth, Github +from rich import print +from rich.ansi import AnsiDecoder +from rich_click.rich_command import RichCommand + +from apps.staging_sync.cli import _get_container_name +from etl import config +from etl.paths import BASE_DIR + +log = structlog.get_logger() + + +EXCLUDE_DATASETS = "weekly_wildfires|excess_mortality|covid|fluid|flunet|country_profile" + + +@click.command(name="owidbot-etl-diff", cls=RichCommand, help=__doc__) +@click.option( + "--branch", + type=str, +) +@click.option( + "--include", + type=str, + default="garden", + help="Include datasets matching this regex.", +) +@click.option( + "--dry-run/--no-dry-run", + default=False, + type=bool, + help="Print to console, do not post to Github.", +) +def cli( + branch: str, + include: str, + dry_run: bool, +) -> None: + """Post result of `etl diff` to Github PR. 
+ + Example: + + ``` + $ python apps/owidbot/etldiff.py --branch my-branch + ``` + """ + t = time.time() + + lines = call_etl_diff(include) + diff, result = format_etl_diff(lines) + + container_name = _get_container_name(branch) if branch else "dry-run" + + # TODO: only include site-screenshots if the PR is from owid-grapher. Similarly, don't + # run etl diff if the PR is from etl repo. + # - **Site-screenshots**: https://github.com/owid/site-screenshots/compare/{nbranch} + + body = f""" + + +Staging server: + +- **Admin**: http://{container_name}/admin/login +- **Site**: http://{container_name}/ +- **Login**: `ssh owid@{container_name}` + + + + +etl diff: {result} + +```diff +{diff} +``` + +Automatically updated datasets matching _{EXCLUDE_DATASETS}_ are not included + + +_Edited: {dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")} UTC_ +_Execution time: {time.time() - t:.2f} seconds_ + """.strip() + + if dry_run: + print(body) + else: + post_comment_to_pr(branch, body) + + +def post_comment_to_pr(branch_name: str, body: str) -> None: + assert config.OWIDBOT_ACCESS_TOKEN + auth = Auth.Token(config.OWIDBOT_ACCESS_TOKEN) + g = Github(auth=auth) + + repo = g.get_repo("owid/etl") + + # Find pull requests for the branch (assuming you're looking for open PRs) + pulls = repo.get_pulls(state="open", sort="created", head=f"{repo.owner.login}:{branch_name}") + pulls = list(pulls) + + if len(pulls) == 0: + raise AssertionError(f"No open PR found for branch {branch_name}") + elif len(pulls) > 1: + raise AssertionError(f"More than one open PR found for branch {branch_name}") + + pr = pulls[0] + + comments = pr.get_issue_comments() + + owidbot_comments = [comment for comment in comments if comment.user.login == "owidbot"] + + if len(owidbot_comments) == 0: + pr.create_issue_comment(body=body) + elif len(owidbot_comments) == 1: + owidbot_comment = owidbot_comments[0] + owidbot_comment.edit(body=body) + else: + raise AssertionError("More than one owidbot comment found.") + + +def format_etl_diff(lines: list[str]) -> Tuple[str, str]: + new_lines = [] + result = "" + for line in lines: + # extract result + if line and line[0] in ("✅", "❌", "⚠️", "❓"): + result = line + continue + + # skip some lines + if "this may get slow" in line or "comparison with compare" in line: + continue + + if line.strip().startswith("-"): + line = "-" + line[1:] + if line.strip().startswith("+"): + line = "+" + line[1:] + + new_lines.append(line) + + diff = "\n".join(new_lines) + + # NOTE: we don't need this anymore, we now have consistent checksums on local and remote + # Some datasets might have different checksum, but be the same (this is caused by checksum_input and checksum_output + # problem). Hotfix this by removing matching datasets from the output. + # Example: + # = Dataset meadow/agriculture/2024-03-26/attainable_yields + # = Table attainable_yields + # = Dataset garden/agriculture/2024-03-26/attainable_yields + # = Table attainable_yields + # ~ Column A + # = Dataset grapher/agriculture/2024-03-26/attainable_yields + # = Table attainable_yields + # pattern = r"(= Dataset.*(?:\n\s+=.*)+)\n(?=. 
Dataset|\n)" + # diff = re.sub(pattern, "", diff) + + return diff, result + + +def call_etl_diff(include: str) -> list[str]: + cmd = [ + "poetry", + "run", + "etl", + "diff", + "REMOTE", + "data/", + "--include", + include, + "--exclude", + EXCLUDE_DATASETS, + "--verbose", + "--workers", + "3", + ] + + result = subprocess.Popen(cmd, cwd=BASE_DIR, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = result.communicate() + + stdout = stdout.decode() + stderr = stderr.decode() + + if stderr: + raise Exception(f"Error: {stderr}") + + return [str(line) for line in AnsiDecoder().decode(stdout)] + + +if __name__ == "__main__": + cli() diff --git a/apps/staging_sync/cli.py b/apps/staging_sync/cli.py index df577eb68d5..b10e0968b1a 100644 --- a/apps/staging_sync/cli.py +++ b/apps/staging_sync/cli.py @@ -18,7 +18,7 @@ from etl import grapher_model as gm from etl.config import GRAPHER_USER_ID from etl.datadiff import _dict_diff -from etl.db import Engine, get_engine +from etl.db import Engine, get_engine, read_sql from .admin_api import AdminAPI @@ -404,7 +404,7 @@ def _modified_chart_ids_by_admin(session: Session) -> Set[int]: select id from charts where publishedAt is not null ) """ - return set(pd.read_sql(q, session.bind).chartId.tolist()) + return set(read_sql(q, session.bind).chartId.tolist()) # type: ignore def _get_git_branch_creation_date(branch_name: str) -> dt.datetime: diff --git a/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py b/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py index 2464ce7261a..fad0174bcdc 100644 --- a/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py +++ b/apps/wizard/etl_steps/cookiecutter/garden/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py @@ -23,7 +23,7 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries( df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path ) - tb = tb.set_index(["country", "year"], verify_integrity=True) + tb = tb.format(["country", "year"]) # # Save outputs. diff --git a/apps/wizard/etl_steps/cookiecutter/meadow/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py b/apps/wizard/etl_steps/cookiecutter/meadow/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py index c318ba23bc9..66b81ab09f5 100644 --- a/apps/wizard/etl_steps/cookiecutter/meadow/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py +++ b/apps/wizard/etl_steps/cookiecutter/meadow/{{cookiecutter.namespace}}/{{cookiecutter.version}}/{{cookiecutter.short_name}}.py @@ -20,7 +20,7 @@ def run(dest_dir: str) -> None: # Process data. # # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. - tb = tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index() + tb = tb.format(["country", "year"]) # # Save outputs. 
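
Note on the cookiecutter template change above: `tb.format(["country", "year"])` replaces the previous `underscore().set_index(...).sort_index()` chain. A minimal sketch of that previous behaviour, assuming `Table.format` is a convenience wrapper for it (the wrapper itself is not part of this diff):

```python
from owid.catalog import Table


def format_like_old_template(tb: Table) -> Table:
    # Equivalent of the chain the templates used before: snake-case the columns,
    # set a verified (country, year) index, and sort it.
    return tb.underscore().set_index(["country", "year"], verify_integrity=True).sort_index()
```
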
diff --git a/dag/archive/climate.yml b/dag/archive/climate.yml index 64995e71adc..ff8675494a9 100644 --- a/dag/archive/climate.yml +++ b/dag/archive/climate.yml @@ -18,3 +18,232 @@ steps: - snapshot://imbie/2024-01-02/ice_sheet_mass_balance_greenland.csv data://garden/imbie/2024-01-02/ice_sheet_mass_balance: - data://meadow/imbie/2024-01-02/ice_sheet_mass_balance + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://meadow/climate/2024-01-31/sea_surface_temperature: + - snapshot://climate/2024-01-31/sea_surface_temperature_world.csv + - snapshot://climate/2024-01-31/sea_surface_temperature_northern_hemisphere.csv + - snapshot://climate/2024-01-31/sea_surface_temperature_southern_hemisphere.csv + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://garden/climate/2024-01-31/sea_surface_temperature: + - data://meadow/climate/2024-01-31/sea_surface_temperature + # + # GISS - Surface temperature analysis. + # + data://meadow/climate/2024-01-31/surface_temperature_analysis: + - snapshot://climate/2024-01-31/surface_temperature_analysis_world.csv + - snapshot://climate/2024-01-31/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-01-31/surface_temperature_analysis_southern_hemisphere.csv + # + # GISS - Surface temperature analysis. + # + data://garden/climate/2024-01-31/surface_temperature_analysis: + - data://meadow/climate/2024-01-31/surface_temperature_analysis + # + # NSIDC - Arctic sea ice extent. + # + data://meadow/climate/2024-01-31/sea_ice_index: + - snapshot://climate/2024-01-31/sea_ice_index.xlsx + # + # NSIDC - Arctic sea ice extent. + # + data://garden/climate/2024-01-31/sea_ice_index: + - data://meadow/climate/2024-01-31/sea_ice_index + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://meadow/climate/2024-01-31/ocean_heat_content: + - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_2000m.csv + - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_2000m.csv + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://garden/climate/2024-01-31/ocean_heat_content: + - data://meadow/climate/2024-01-31/ocean_heat_content + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://meadow/climate/2024-01-31/hawaii_ocean_time_series: + - snapshot://climate/2024-01-31/hawaii_ocean_time_series.csv + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://garden/climate/2024-01-31/ocean_ph_levels: + - data://meadow/climate/2024-01-31/hawaii_ocean_time_series + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://meadow/climate/2024-01-31/snow_cover_extent: + - snapshot://climate/2024-01-31/snow_cover_extent_north_america.csv + - snapshot://climate/2024-01-31/snow_cover_extent_northern_hemisphere.csv + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://garden/climate/2024-01-31/snow_cover_extent: + - data://meadow/climate/2024-01-31/snow_cover_extent + # + # NOAA Global Monitoring Laboratory - GHG concentration. 
+ # + data://meadow/climate/2024-01-31/ghg_concentration: + - snapshot://climate/2024-01-31/co2_concentration_monthly.csv + - snapshot://climate/2024-01-31/ch4_concentration_monthly.csv + - snapshot://climate/2024-01-31/n2o_concentration_monthly.csv + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://garden/climate/2024-01-31/ghg_concentration: + - data://meadow/climate/2024-01-31/ghg_concentration + # + # Various sources - Long-run greenhouse gas concentration. + # + data://garden/climate/2024-01-31/long_run_ghg_concentration: + - data://garden/epa/2024-01-29/ghg_concentration + - data://garden/climate/2024-01-31/ghg_concentration + # + # Various sources - Climate change impacts. + # + data://garden/climate/2024-01-31/climate_change_impacts: + - data://garden/climate/2024-01-31/surface_temperature_analysis + - data://garden/climate/2024-01-31/sea_ice_index + - data://garden/climate/2024-01-31/sea_surface_temperature + - data://garden/climate/2024-01-31/ocean_heat_content + - data://garden/climate/2024-01-31/ocean_ph_levels + - data://garden/climate/2024-01-31/snow_cover_extent + - data://garden/climate/2024-01-31/ghg_concentration + - data://garden/climate/2024-01-31/long_run_ghg_concentration + - data://garden/climate/2024-01-28/global_sea_level + - data://garden/epa/2024-01-29/ocean_heat_content + - data://garden/epa/2024-01-29/ice_sheet_mass_balance + - data://garden/epa/2024-01-29/mass_balance_us_glaciers + # + # Various sources - Climate change impacts (annual). + # + data://grapher/climate/2024-01-31/climate_change_impacts_annual: + - data://garden/climate/2024-01-31/climate_change_impacts + # + # Various sources - Climate change impacts (monthly). + # + data://grapher/climate/2024-01-31/climate_change_impacts_monthly: + - data://garden/climate/2024-01-31/climate_change_impacts + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://meadow/climate/2024-03-11/hawaii_ocean_time_series: + - snapshot://climate/2024-03-11/hawaii_ocean_time_series.csv + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://meadow/climate/2024-03-11/ocean_heat_content: + - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_2000m.csv + - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_2000m.csv + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://meadow/climate/2024-03-11/snow_cover_extent: + - snapshot://climate/2024-03-11/snow_cover_extent_northern_hemisphere.csv + - snapshot://climate/2024-03-11/snow_cover_extent_north_america.csv + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://meadow/climate/2024-03-11/sea_surface_temperature: + - snapshot://climate/2024-03-11/sea_surface_temperature_northern_hemisphere.csv + - snapshot://climate/2024-03-11/sea_surface_temperature_southern_hemisphere.csv + - snapshot://climate/2024-03-11/sea_surface_temperature_world.csv + # + # NSIDC - Arctic sea ice extent. + # + data://meadow/climate/2024-03-11/sea_ice_index: + - snapshot://climate/2024-03-11/sea_ice_index.xlsx + # + # GISS - Surface temperature analysis. 
+ # + data://meadow/climate/2024-03-11/surface_temperature_analysis: + - snapshot://climate/2024-03-11/surface_temperature_analysis_southern_hemisphere.csv + - snapshot://climate/2024-03-11/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-03-11/surface_temperature_analysis_world.csv + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://meadow/climate/2024-03-11/ghg_concentration: + - snapshot://climate/2024-03-11/n2o_concentration_monthly.csv + - snapshot://climate/2024-03-11/co2_concentration_monthly.csv + - snapshot://climate/2024-03-11/ch4_concentration_monthly.csv + # + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # + data://garden/climate/2024-03-11/ocean_ph_levels: + - data://meadow/climate/2024-03-11/hawaii_ocean_time_series + # + # NOAA National Centers for Environmental Information - Ocean Heat Content. + # + data://garden/climate/2024-03-11/ocean_heat_content: + - data://meadow/climate/2024-03-11/ocean_heat_content + # + # Rutgers University Global Snow Lab - Snow Cover Extent. + # + data://garden/climate/2024-03-11/snow_cover_extent: + - data://meadow/climate/2024-03-11/snow_cover_extent + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://garden/climate/2024-03-11/sea_surface_temperature: + - data://meadow/climate/2024-03-11/sea_surface_temperature + # + # NSIDC - Arctic sea ice extent. + # + data://garden/climate/2024-03-11/sea_ice_index: + - data://meadow/climate/2024-03-11/sea_ice_index + # + # GISS - Surface temperature analysis. + # + data://garden/climate/2024-03-11/surface_temperature_analysis: + - data://meadow/climate/2024-03-11/surface_temperature_analysis + # + # NOAA Global Monitoring Laboratory - GHG concentration. + # + data://garden/climate/2024-03-11/ghg_concentration: + - data://meadow/climate/2024-03-11/ghg_concentration + # + # Various sources - Long-run greenhouse gas concentration. + # + data://garden/climate/2024-03-11/long_run_ghg_concentration: + - data://garden/epa/2024-01-29/ghg_concentration + - data://garden/climate/2024-03-11/ghg_concentration + # + # Various sources - Climate change impacts. + # + data://garden/climate/2024-03-11/climate_change_impacts: + - data://garden/climate/2024-03-11/long_run_ghg_concentration + - data://garden/climate/2024-01-28/global_sea_level + - data://garden/epa/2024-01-29/ice_sheet_mass_balance + - data://garden/epa/2024-01-29/mass_balance_us_glaciers + - data://garden/epa/2024-01-29/ocean_heat_content + - data://garden/climate/2024-03-11/ocean_heat_content + - data://garden/climate/2024-03-11/surface_temperature_analysis + - data://garden/climate/2024-03-11/sea_ice_index + - data://garden/climate/2024-03-11/ghg_concentration + - data://garden/climate/2024-03-11/ocean_ph_levels + - data://garden/climate/2024-03-11/sea_surface_temperature + - data://garden/climate/2024-03-11/snow_cover_extent + # + # Various sources - Climate change impacts (monthly). + # + data://grapher/climate/2024-03-11/climate_change_impacts_monthly: + - data://garden/climate/2024-03-11/climate_change_impacts + # + # Various sources - Climate change impacts (annual). + # + data://grapher/climate/2024-03-11/climate_change_impacts_annual: + - data://garden/climate/2024-03-11/climate_change_impacts + # + # GISS - Surface temperature analysis. 
+ # + data://grapher/climate/latest/surface_temperature_analysis: + - data://garden/climate/2024-03-11/surface_temperature_analysis diff --git a/dag/archive/emissions.yml b/dag/archive/emissions.yml index d9495ba2a29..cf0b2e82571 100644 --- a/dag/archive/emissions.yml +++ b/dag/archive/emissions.yml @@ -138,3 +138,28 @@ steps: - data://garden/wb/2023-04-30/income_groups data://grapher/gcp/2023-12-05/global_carbon_budget: - data://garden/gcp/2023-12-05/global_carbon_budget + # + # Emissions - CO2 dataset (2023-12-12). + # + data://garden/emissions/2023-12-12/owid_co2: + - data://garden/emissions/2023-11-23/national_contributions + - data://garden/gcp/2023-12-12/global_carbon_budget + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/energy/2023-12-12/primary_energy_consumption + - data://garden/demography/2023-03-31/population + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/regions/2023-01-01/regions + # + # Jones et al. (2023) - National contributions to climate change. + # + data://meadow/emissions/2023-11-23/national_contributions: + - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv + data://garden/emissions/2023-11-23/national_contributions: + - data://meadow/emissions/2023-11-23/national_contributions + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2023-04-30/income_groups + data://grapher/emissions/2023-11-23/national_contributions: + - data://garden/emissions/2023-11-23/national_contributions diff --git a/dag/archive/poverty_inequality.yml b/dag/archive/poverty_inequality.yml new file mode 100644 index 00000000000..0d09af23528 --- /dev/null +++ b/dag/archive/poverty_inequality.yml @@ -0,0 +1,8 @@ +steps: + # OECD Income Distribution Database + data://meadow/oecd/2023-06-06/income_distribution_database: + - snapshot://oecd/2023-06-06/income_distribution_database.csv + data://garden/oecd/2023-06-06/income_distribution_database: + - data://meadow/oecd/2023-06-06/income_distribution_database + data://grapher/oecd/2023-06-06/income_distribution_database: + - data://garden/oecd/2023-06-06/income_distribution_database diff --git a/dag/climate.yml b/dag/climate.yml index 0dcb469b54a..8dd0e2f1a32 100644 --- a/dag/climate.yml +++ b/dag/climate.yml @@ -39,7 +39,7 @@ steps: # Copernicus Climate Change Service - Surface temperature. # data://meadow/climate/2023-12-20/surface_temperature: - - snapshot://climate/2024-03-12/surface_temperature.gz + - snapshot://climate/2024-04-12/surface_temperature.gz - snapshot://countries/2023-12-27/world_bank.zip data://garden/climate/2023-12-20/surface_temperature: - data://meadow/climate/2023-12-20/surface_temperature @@ -54,127 +54,10 @@ steps: data://grapher/climate/2023-12-20/surface_temperature_annual_average: - data://garden/climate/2023-12-20/surface_temperature # - # Met Office Hadley Centre - Sea surface temperature. - # - data://meadow/climate/2024-01-31/sea_surface_temperature: - - snapshot://climate/2024-01-31/sea_surface_temperature_world.csv - - snapshot://climate/2024-01-31/sea_surface_temperature_northern_hemisphere.csv - - snapshot://climate/2024-01-31/sea_surface_temperature_southern_hemisphere.csv - # - # Met Office Hadley Centre - Sea surface temperature. 
- # - data://garden/climate/2024-01-31/sea_surface_temperature: - - data://meadow/climate/2024-01-31/sea_surface_temperature - # - # GISS - Surface temperature analysis. - # - data://meadow/climate/2024-01-31/surface_temperature_analysis: - - snapshot://climate/2024-01-31/surface_temperature_analysis_world.csv - - snapshot://climate/2024-01-31/surface_temperature_analysis_northern_hemisphere.csv - - snapshot://climate/2024-01-31/surface_temperature_analysis_southern_hemisphere.csv - # - # GISS - Surface temperature analysis. - # - data://garden/climate/2024-01-31/surface_temperature_analysis: - - data://meadow/climate/2024-01-31/surface_temperature_analysis - # - # GISS - Surface temperature analysis. - # - data://grapher/climate/latest/surface_temperature_analysis: - - data://garden/climate/2024-01-31/surface_temperature_analysis - # - # NSIDC - Arctic sea ice extent. - # - data://meadow/climate/2024-01-31/sea_ice_index: - - snapshot://climate/2024-01-31/sea_ice_index.xlsx - # - # NSIDC - Arctic sea ice extent. - # - data://garden/climate/2024-01-31/sea_ice_index: - - data://meadow/climate/2024-01-31/sea_ice_index - # - # NOAA National Centers for Environmental Information - Ocean Heat Content. - # - data://meadow/climate/2024-01-31/ocean_heat_content: - - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_700m.csv - - snapshot://climate/2024-01-31/ocean_heat_content_monthly_world_2000m.csv - - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_700m.csv - - snapshot://climate/2024-01-31/ocean_heat_content_annual_world_2000m.csv - # - # NOAA National Centers for Environmental Information - Ocean Heat Content. - # - data://garden/climate/2024-01-31/ocean_heat_content: - - data://meadow/climate/2024-01-31/ocean_heat_content - # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). - # - data://meadow/climate/2024-01-31/hawaii_ocean_time_series: - - snapshot://climate/2024-01-31/hawaii_ocean_time_series.csv - # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). - # - data://garden/climate/2024-01-31/ocean_ph_levels: - - data://meadow/climate/2024-01-31/hawaii_ocean_time_series - # - # Rutgers University Global Snow Lab - Snow Cover Extent. - # - data://meadow/climate/2024-01-31/snow_cover_extent: - - snapshot://climate/2024-01-31/snow_cover_extent_north_america.csv - - snapshot://climate/2024-01-31/snow_cover_extent_northern_hemisphere.csv - # - # Rutgers University Global Snow Lab - Snow Cover Extent. - # - data://garden/climate/2024-01-31/snow_cover_extent: - - data://meadow/climate/2024-01-31/snow_cover_extent - # - # NOAA Global Monitoring Laboratory - GHG concentration. - # - data://meadow/climate/2024-01-31/ghg_concentration: - - snapshot://climate/2024-01-31/co2_concentration_monthly.csv - - snapshot://climate/2024-01-31/ch4_concentration_monthly.csv - - snapshot://climate/2024-01-31/n2o_concentration_monthly.csv - # - # NOAA Global Monitoring Laboratory - GHG concentration. - # - data://garden/climate/2024-01-31/ghg_concentration: - - data://meadow/climate/2024-01-31/ghg_concentration - # - # Various sources - Long-run greenhouse gas concentration. - # - data://garden/climate/2024-01-31/long_run_ghg_concentration: - - data://garden/epa/2024-01-29/ghg_concentration - - data://garden/climate/2024-01-31/ghg_concentration - # - # Various sources - Climate change impacts. 
- # - data://garden/climate/2024-01-31/climate_change_impacts: - - data://garden/climate/2024-01-31/surface_temperature_analysis - - data://garden/climate/2024-01-31/sea_ice_index - - data://garden/climate/2024-01-31/sea_surface_temperature - - data://garden/climate/2024-01-31/ocean_heat_content - - data://garden/climate/2024-01-31/ocean_ph_levels - - data://garden/climate/2024-01-31/snow_cover_extent - - data://garden/climate/2024-01-31/ghg_concentration - - data://garden/climate/2024-01-31/long_run_ghg_concentration - - data://garden/climate/2024-01-28/global_sea_level - - data://garden/epa/2024-01-29/ocean_heat_content - - data://garden/epa/2024-01-29/ice_sheet_mass_balance - - data://garden/epa/2024-01-29/mass_balance_us_glaciers - # - # Various sources - Climate change impacts (annual). - # - data://grapher/climate/2024-01-31/climate_change_impacts_annual: - - data://garden/climate/2024-01-31/climate_change_impacts - # - # Various sources - Climate change impacts (monthly). - # - data://grapher/climate/2024-01-31/climate_change_impacts_monthly: - - data://garden/climate/2024-01-31/climate_change_impacts - # # Climate change impacts data explorer. # data://explorers/climate/latest/climate_change_impacts: - - data://garden/climate/2024-03-11/climate_change_impacts + - data://garden/climate/2024-04-17/climate_change_impacts # # Global Wildfire Information System - Monthly burned area. # @@ -235,114 +118,136 @@ steps: data://grapher/met_office_hadley_centre/2024-03-04/near_surface_temperature: - data://garden/met_office_hadley_centre/2024-03-04/near_surface_temperature # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # EPA - Climate change indicators (possibly not updateable). # - data://meadow/climate/2024-03-11/hawaii_ocean_time_series: - - snapshot://climate/2024-03-11/hawaii_ocean_time_series.csv + data://meadow/epa/2024-04-17/ocean_heat_content: + - snapshot://epa/2024-04-17/ocean_heat_content_annual_world_700m.csv + - snapshot://epa/2024-04-17/ocean_heat_content_annual_world_2000m.csv + data://garden/epa/2024-04-17/ocean_heat_content: + - data://meadow/epa/2024-04-17/ocean_heat_content + data://meadow/epa/2024-04-17/ice_sheet_mass_balance: + - snapshot://epa/2024-04-17/ice_sheet_mass_balance.csv + data://garden/epa/2024-04-17/ice_sheet_mass_balance: + - data://meadow/epa/2024-04-17/ice_sheet_mass_balance + data://meadow/epa/2024-04-17/ghg_concentration: + - snapshot://epa/2024-04-17/co2_concentration.csv + - snapshot://epa/2024-04-17/ch4_concentration.csv + - snapshot://epa/2024-04-17/n2o_concentration.csv + data://garden/epa/2024-04-17/ghg_concentration: + - data://meadow/epa/2024-04-17/ghg_concentration + data://meadow/epa/2024-04-17/mass_balance_us_glaciers: + - snapshot://epa/2024-04-17/mass_balance_us_glaciers.csv + data://garden/epa/2024-04-17/mass_balance_us_glaciers: + - data://meadow/epa/2024-04-17/mass_balance_us_glaciers # - # NOAA National Centers for Environmental Information - Ocean Heat Content. + # Rutgers University Global Snow Lab - Snow Cover Extent. 
# - data://meadow/climate/2024-03-11/ocean_heat_content: - - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_700m.csv - - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_2000m.csv - - snapshot://climate/2024-03-11/ocean_heat_content_annual_world_700m.csv - - snapshot://climate/2024-03-11/ocean_heat_content_monthly_world_2000m.csv + data://meadow/climate/2024-04-17/snow_cover_extent: + - snapshot://climate/2024-04-17/snow_cover_extent_north_america.csv + - snapshot://climate/2024-04-17/snow_cover_extent_northern_hemisphere.csv # - # Rutgers University Global Snow Lab - Snow Cover Extent. + # NOAA National Centers for Environmental Information - Ocean Heat Content. # - data://meadow/climate/2024-03-11/snow_cover_extent: - - snapshot://climate/2024-03-11/snow_cover_extent_northern_hemisphere.csv - - snapshot://climate/2024-03-11/snow_cover_extent_north_america.csv + data://meadow/climate/2024-04-17/ocean_heat_content: + - snapshot://climate/2024-04-17/ocean_heat_content_annual_world_2000m.csv + - snapshot://climate/2024-04-17/ocean_heat_content_monthly_world_700m.csv + - snapshot://climate/2024-04-17/ocean_heat_content_annual_world_700m.csv + - snapshot://climate/2024-04-17/ocean_heat_content_monthly_world_2000m.csv # - # Met Office Hadley Centre - Sea surface temperature. + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). # - data://meadow/climate/2024-03-11/sea_surface_temperature: - - snapshot://climate/2024-03-11/sea_surface_temperature_northern_hemisphere.csv - - snapshot://climate/2024-03-11/sea_surface_temperature_southern_hemisphere.csv - - snapshot://climate/2024-03-11/sea_surface_temperature_world.csv + data://meadow/climate/2024-04-17/hawaii_ocean_time_series: + - snapshot://climate/2024-04-17/hawaii_ocean_time_series.csv # # NSIDC - Arctic sea ice extent. # - data://meadow/climate/2024-03-11/sea_ice_index: - - snapshot://climate/2024-03-11/sea_ice_index.xlsx + data://meadow/climate/2024-04-17/sea_ice_index: + - snapshot://climate/2024-04-17/sea_ice_index.xlsx + # + # Met Office Hadley Centre - Sea surface temperature. + # + data://meadow/climate/2024-04-17/sea_surface_temperature: + - snapshot://climate/2024-04-17/sea_surface_temperature_world.csv + - snapshot://climate/2024-04-17/sea_surface_temperature_southern_hemisphere.csv + - snapshot://climate/2024-04-17/sea_surface_temperature_northern_hemisphere.csv # # GISS - Surface temperature analysis. # - data://meadow/climate/2024-03-11/surface_temperature_analysis: - - snapshot://climate/2024-03-11/surface_temperature_analysis_southern_hemisphere.csv - - snapshot://climate/2024-03-11/surface_temperature_analysis_northern_hemisphere.csv - - snapshot://climate/2024-03-11/surface_temperature_analysis_world.csv + data://meadow/climate/2024-04-17/surface_temperature_analysis: + - snapshot://climate/2024-04-17/surface_temperature_analysis_northern_hemisphere.csv + - snapshot://climate/2024-04-17/surface_temperature_analysis_world.csv + - snapshot://climate/2024-04-17/surface_temperature_analysis_southern_hemisphere.csv # # NOAA Global Monitoring Laboratory - GHG concentration. 
# - data://meadow/climate/2024-03-11/ghg_concentration: - - snapshot://climate/2024-03-11/n2o_concentration_monthly.csv - - snapshot://climate/2024-03-11/co2_concentration_monthly.csv - - snapshot://climate/2024-03-11/ch4_concentration_monthly.csv + data://meadow/climate/2024-04-17/ghg_concentration: + - snapshot://climate/2024-04-17/co2_concentration_monthly.csv + - snapshot://climate/2024-04-17/n2o_concentration_monthly.csv + - snapshot://climate/2024-04-17/ch4_concentration_monthly.csv # - # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). + # Rutgers University Global Snow Lab - Snow Cover Extent. # - data://garden/climate/2024-03-11/ocean_ph_levels: - - data://meadow/climate/2024-03-11/hawaii_ocean_time_series + data://garden/climate/2024-04-17/snow_cover_extent: + - data://meadow/climate/2024-04-17/snow_cover_extent # # NOAA National Centers for Environmental Information - Ocean Heat Content. # - data://garden/climate/2024-03-11/ocean_heat_content: - - data://meadow/climate/2024-03-11/ocean_heat_content + data://garden/climate/2024-04-17/ocean_heat_content: + - data://meadow/climate/2024-04-17/ocean_heat_content # - # Rutgers University Global Snow Lab - Snow Cover Extent. + # School of Ocean and Earth Science and Technology - Hawaii Ocean Time-series (HOT). # - data://garden/climate/2024-03-11/snow_cover_extent: - - data://meadow/climate/2024-03-11/snow_cover_extent + data://garden/climate/2024-04-17/ocean_ph_levels: + - data://meadow/climate/2024-04-17/hawaii_ocean_time_series # - # Met Office Hadley Centre - Sea surface temperature. + # NSIDC - Arctic sea ice extent. # - data://garden/climate/2024-03-11/sea_surface_temperature: - - data://meadow/climate/2024-03-11/sea_surface_temperature + data://garden/climate/2024-04-17/sea_ice_index: + - data://meadow/climate/2024-04-17/sea_ice_index # - # NSIDC - Arctic sea ice extent. + # Met Office Hadley Centre - Sea surface temperature. # - data://garden/climate/2024-03-11/sea_ice_index: - - data://meadow/climate/2024-03-11/sea_ice_index + data://garden/climate/2024-04-17/sea_surface_temperature: + - data://meadow/climate/2024-04-17/sea_surface_temperature # # GISS - Surface temperature analysis. # - data://garden/climate/2024-03-11/surface_temperature_analysis: - - data://meadow/climate/2024-03-11/surface_temperature_analysis + data://garden/climate/2024-04-17/surface_temperature_analysis: + - data://meadow/climate/2024-04-17/surface_temperature_analysis # # NOAA Global Monitoring Laboratory - GHG concentration. # - data://garden/climate/2024-03-11/ghg_concentration: - - data://meadow/climate/2024-03-11/ghg_concentration + data://garden/climate/2024-04-17/ghg_concentration: + - data://meadow/climate/2024-04-17/ghg_concentration # # Various sources - Long-run greenhouse gas concentration. # - data://garden/climate/2024-03-11/long_run_ghg_concentration: - - data://garden/epa/2024-01-29/ghg_concentration - - data://garden/climate/2024-03-11/ghg_concentration + data://garden/climate/2024-04-17/long_run_ghg_concentration: + - data://garden/epa/2024-04-17/ghg_concentration + - data://garden/climate/2024-04-17/ghg_concentration # # Various sources - Climate change impacts. 
# - data://garden/climate/2024-03-11/climate_change_impacts: - - data://garden/climate/2024-03-11/long_run_ghg_concentration + data://garden/climate/2024-04-17/climate_change_impacts: + - data://garden/epa/2024-04-17/ocean_heat_content + - data://garden/epa/2024-04-17/mass_balance_us_glaciers + - data://garden/climate/2024-04-17/sea_ice_index - data://garden/climate/2024-01-28/global_sea_level - - data://garden/epa/2024-01-29/ice_sheet_mass_balance - - data://garden/epa/2024-01-29/mass_balance_us_glaciers - - data://garden/epa/2024-01-29/ocean_heat_content - - data://garden/climate/2024-03-11/ocean_heat_content - - data://garden/climate/2024-03-11/surface_temperature_analysis - - data://garden/climate/2024-03-11/sea_ice_index - - data://garden/climate/2024-03-11/ghg_concentration - - data://garden/climate/2024-03-11/ocean_ph_levels - - data://garden/climate/2024-03-11/sea_surface_temperature - - data://garden/climate/2024-03-11/snow_cover_extent + - data://garden/epa/2024-04-17/ice_sheet_mass_balance + - data://garden/climate/2024-04-17/ghg_concentration + - data://garden/climate/2024-04-17/ocean_ph_levels + - data://garden/climate/2024-04-17/surface_temperature_analysis + - data://garden/climate/2024-04-17/snow_cover_extent + - data://garden/climate/2024-04-17/sea_surface_temperature + - data://garden/climate/2024-04-17/ocean_heat_content + - data://garden/climate/2024-04-17/long_run_ghg_concentration # - # Various sources - Climate change impacts (monthly). + # Various sources - Climate change impacts (annual). # - data://grapher/climate/2024-03-11/climate_change_impacts_monthly: - - data://garden/climate/2024-03-11/climate_change_impacts + data://grapher/climate/2024-04-17/climate_change_impacts_annual: + - data://garden/climate/2024-04-17/climate_change_impacts # - # Various sources - Climate change impacts (annual). + # Various sources - Climate change impacts (monthly). # - data://grapher/climate/2024-03-11/climate_change_impacts_annual: - - data://garden/climate/2024-03-11/climate_change_impacts + data://grapher/climate/2024-04-17/climate_change_impacts_monthly: + - data://garden/climate/2024-04-17/climate_change_impacts diff --git a/dag/emissions.yml b/dag/emissions.yml index 623547c5067..0b9e9250739 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -45,17 +45,6 @@ steps: data://grapher/gcp/2023-12-12/global_carbon_budget: - data://garden/gcp/2023-12-12/global_carbon_budget # - # Emissions - CO2 dataset (2023-12-12). - # - data://garden/emissions/2023-12-12/owid_co2: - - data://garden/emissions/2023-11-23/national_contributions - - data://garden/gcp/2023-12-12/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - - data://garden/energy/2023-12-12/primary_energy_consumption - - data://garden/demography/2023-03-31/population - - data://garden/ggdc/2020-10-01/ggdc_maddison - - data://garden/regions/2023-01-01/regions - # # RFF - World Carbon Pricing (2022-09-14). # data://meadow/rff/2023-10-19/world_carbon_pricing: @@ -76,20 +65,6 @@ steps: data://grapher/rff/2023-10-19/emissions_weighted_carbon_price: - data://garden/rff/2023-10-19/emissions_weighted_carbon_price # - # Jones et al. (2023) - National contributions to climate change. 
- # - data://meadow/emissions/2023-11-23/national_contributions: - - snapshot://emissions/2023-11-23/national_contributions_annual_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_cumulative_emissions.csv - - snapshot://emissions/2023-11-23/national_contributions_temperature_response.csv - data://garden/emissions/2023-11-23/national_contributions: - - data://meadow/emissions/2023-11-23/national_contributions - - data://garden/regions/2023-01-01/regions - - data://garden/demography/2023-03-31/population - - data://garden/wb/2023-04-30/income_groups - data://grapher/emissions/2023-11-23/national_contributions: - - data://garden/emissions/2023-11-23/national_contributions - # # IPCC - Emission Factor Database (2023-10-24). # data://meadow/emissions/2023-10-24/emission_factors: @@ -124,6 +99,31 @@ steps: data://garden/emissions/2024-02-26/gdp_and_co2_decoupling: - data://garden/gcp/2023-12-12/global_carbon_budget - data://garden/worldbank_wdi/2023-05-29/wdi + # + # Jones et al. - National contributions to climate change. + # + data://meadow/emissions/2024-04-08/national_contributions: + - snapshot://emissions/2024-04-08/national_contributions_temperature_response.csv + - snapshot://emissions/2024-04-08/national_contributions_cumulative_emissions.csv + - snapshot://emissions/2024-04-08/national_contributions_annual_emissions.csv + data://garden/emissions/2024-04-08/national_contributions: + - data://meadow/emissions/2024-04-08/national_contributions + - data://garden/demography/2023-03-31/population + - data://garden/wb/2024-03-11/income_groups + - data://garden/regions/2023-01-01/regions + data://grapher/emissions/2024-04-08/national_contributions: + - data://garden/emissions/2024-04-08/national_contributions + # + # Emissions - CO2 dataset. + # + data://garden/emissions/latest/owid_co2: + - data://garden/ggdc/2020-10-01/ggdc_maddison + - data://garden/energy/2023-12-12/primary_energy_consumption + - data://garden/emissions/2024-04-08/national_contributions + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/gcp/2023-12-12/global_carbon_budget ###################################################################################################################### # Older versions that should be archived once they are not used by any other steps. 
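
Rough illustration of how these `dag/*.yml` entries are consumed (assumed structure; the real loading lives behind `etl.steps.load_dag`, which this diff imports elsewhere): each step URI maps to the steps it depends on, and that mapping feeds a topological sort such as the `graphlib.TopologicalSorter` used in `etl/command.py`.

```python
from graphlib import TopologicalSorter

import yaml

# A small excerpt in the same shape as the dag/*.yml files touched by this diff.
DAG_SNIPPET = """
steps:
  data://garden/emissions/2024-04-08/national_contributions:
    - data://meadow/emissions/2024-04-08/national_contributions
  data://grapher/emissions/2024-04-08/national_contributions:
    - data://garden/emissions/2024-04-08/national_contributions
"""

# Map each step to its set of dependencies (predecessors).
dag = {step: set(deps or []) for step, deps in yaml.safe_load(DAG_SNIPPET)["steps"].items()}

# Predecessors come first: meadow, then garden, then grapher.
print(list(TopologicalSorter(dag).static_order()))
```
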
diff --git a/dag/fasttrack.yml b/dag/fasttrack.yml index 7b0596c8741..3637725ec67 100644 --- a/dag/fasttrack.yml +++ b/dag/fasttrack.yml @@ -154,3 +154,9 @@ steps: - snapshot://fasttrack/latest/usa_weather_climate_noaa.csv data://grapher/fasttrack/latest/global_precipitation_anomaly_noaa: - snapshot://fasttrack/latest/global_precipitation_anomaly_noaa.csv + data://grapher/fasttrack/latest/gpei: + - snapshot://fasttrack/latest/gpei.csv + data-private://grapher/fasttrack/latest/conflict_deaths_combined: + - snapshot-private://fasttrack/latest/conflict_deaths_combined.csv + data://grapher/fasttrack/2024-04-17/qubits: + - snapshot://fasttrack/2024-04-17/qubits.csv diff --git a/dag/poverty_inequality.yml b/dag/poverty_inequality.yml index f4cf015baa2..d156dd48257 100644 --- a/dag/poverty_inequality.yml +++ b/dag/poverty_inequality.yml @@ -7,24 +7,24 @@ steps: # Poverty and inequality file for Joe's PhD data://explorers/poverty_inequality/latest/poverty_inequality_export: - - data://garden/wb/2024-01-17/world_bank_pip + - data://garden/wb/2024-03-27/world_bank_pip - data://garden/wid/2023-08-24/world_inequality_database - data://garden/lis/2023-08-30/luxembourg_income_study - data://garden/wb/2024-01-22/thousand_bins_distribution - data://garden/worldbank_wdi/2023-05-29/wdi # World Bank Poverty and Inequality Platform - data://meadow/wb/2024-01-17/world_bank_pip: - - snapshot://wb/2024-01-17/world_bank_pip.csv - - snapshot://wb/2024-01-17/world_bank_pip_percentiles.csv - data://garden/wb/2024-01-17/world_bank_pip: - - data://meadow/wb/2024-01-17/world_bank_pip - data://grapher/wb/2024-01-17/world_bank_pip_2011ppp: - - data://garden/wb/2024-01-17/world_bank_pip - data://grapher/wb/2024-01-17/world_bank_pip_2017ppp: - - data://garden/wb/2024-01-17/world_bank_pip + data://meadow/wb/2024-03-27/world_bank_pip: + - snapshot://wb/2024-03-27/world_bank_pip.csv + - snapshot://wb/2024-03-27/world_bank_pip_percentiles.csv + data://garden/wb/2024-03-27/world_bank_pip: + - data://meadow/wb/2024-03-27/world_bank_pip + data://grapher/wb/2024-03-27/world_bank_pip_2011ppp: + - data://garden/wb/2024-03-27/world_bank_pip + data://grapher/wb/2024-03-27/world_bank_pip_2017ppp: + - data://garden/wb/2024-03-27/world_bank_pip data://explorers/wb/latest/world_bank_pip: - - data://garden/wb/2024-01-17/world_bank_pip + - data://garden/wb/2024-03-27/world_bank_pip # World Inequality Database data://meadow/wid/2023-08-24/world_inequality_database: @@ -66,13 +66,13 @@ steps: data://grapher/ophi/2023-07-05/multidimensional_poverty_index: - data://garden/ophi/2023-07-05/multidimensional_poverty_index - # OECD Income Distribution Database - data://meadow/oecd/2023-06-06/income_distribution_database: - - snapshot://oecd/2023-06-06/income_distribution_database.csv - data://garden/oecd/2023-06-06/income_distribution_database: - - data://meadow/oecd/2023-06-06/income_distribution_database - data://grapher/oecd/2023-06-06/income_distribution_database: - - data://garden/oecd/2023-06-06/income_distribution_database + # # OECD Income Distribution Database + data://meadow/oecd/2024-04-10/income_distribution_database: + - snapshot://oecd/2024-04-10/income_distribution_database.csv + data://garden/oecd/2024-04-10/income_distribution_database: + - data://meadow/oecd/2024-04-10/income_distribution_database + data://grapher/oecd/2024-04-10/income_distribution_database: + - data://garden/oecd/2024-04-10/income_distribution_database # Historical poverty data - Moatsos (2021) data://meadow/moatsos/2023-10-09/moatsos_historical_poverty: diff 
--git a/etl/chart_revision/v1/deprecated.py b/etl/chart_revision/v1/deprecated.py index fc47931c239..f4159b7f427 100644 --- a/etl/chart_revision/v1/deprecated.py +++ b/etl/chart_revision/v1/deprecated.py @@ -21,7 +21,7 @@ from tqdm import tqdm from etl.config import DEBUG, GRAPHER_USER_ID -from etl.db import open_db +from etl.db import get_engine from etl.grapher_helpers import IntRange log = structlog.get_logger() @@ -179,23 +179,23 @@ def _get_chart_update_reason(self, variable_ids: List[int]) -> str: Accesses DB and finds out the name of the recently added dataset with the new variables.""" try: - with open_db() as db: + with get_engine().connect() as con: if len(variable_ids) == 1: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN ({variable_ids[0]}) """ - ) + ).fetchmany() else: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN {*variable_ids,} """ - ) + ).fetchmany() except Exception: self.report_error( "Problem found when accessing the DB trying to get details on the newly added variables" @@ -220,10 +220,10 @@ def _get_chart_update_reason(self, variable_ids: List[int]) -> str: def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: n_before = 0 try: - with open_db() as db: - n_before = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_before = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore - res = db.fetch_many( + res = con.execute( """ SELECT * FROM ( @@ -235,7 +235,7 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: ) as grouped WHERE grouped.c > 1 """ - ) + ).fetchmany() if len(res): raise RuntimeError( "Two or more suggested chart revisions with status IN " @@ -267,13 +267,13 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: VALUES (%s, %s, %s, %s, %s, %s, NOW(), NOW()) """ - db.upsert_many(query, tuples) + con.execute(query, tuples) # checks if any of the affected chartIds now has multiple # pending suggested revisions. If so, then rejects the whole # insert and tell the user which suggested chart revisions need # to be approved/rejected. - res = db.fetch_many( + res = con.execute( f""" SELECT id, scr.chartId, c, createdAt FROM ( @@ -291,7 +291,7 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: WHERE grouped.c > 1 ORDER BY createdAt ASC """ - ) + ).fetchmany() if len(res): df = pd.DataFrame(res, columns=["id", "chart_id", "count", "created_at"]) df["drop"] = df.groupby("chart_id")["created_at"].transform(lambda gp: gp == gp.max()) @@ -321,8 +321,8 @@ def insert(self, suggested_chart_revisions: List[dict[str, Any]]) -> None: self.report_error(f"INSERT operation into `suggested_chart_revisions` cancelled. Error: {e}") raise e finally: - with open_db() as db: - n_after = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_after = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore self.report_info( f"{n_after - n_before} of {len(suggested_chart_revisions)} suggested chart revisions inserted." 
@@ -343,18 +343,18 @@ def _get_charts_from_old_variables( df_chart_dimensions: dataframe of chart_dimensions rows. df_chart_revisions: dataframe of chart_revisions rows. """ - with open_db() as db: + with get_engine().connect() as con: # retrieves chart_dimensions variable_ids = list(self.old_var_id2new_var_id.keys()) variable_ids_str = ",".join([str(_id) for _id in variable_ids]) columns = ["id", "chartId", "variableId", "property", "order"] - rows = db.fetch_many( + rows = con.execute( f""" SELECT {','.join([f'`{col}`' for col in columns])} FROM chart_dimensions WHERE variableId IN ({variable_ids_str}) """ - ) + ).fetchmany() df_chart_dimensions = pd.DataFrame(rows, columns=columns) # retrieves charts @@ -369,40 +369,40 @@ def _get_charts_from_old_variables( "lastEditedAt", "publishedAt", ] - rows = db.fetch_many( + rows = con.execute( f""" SELECT {','.join(columns)} FROM charts WHERE id IN ({chart_ids_str}) """ - ) + ).fetchmany() df_charts = pd.DataFrame(rows, columns=columns) # retrieves chart_revisions columns = ["id", "chartId", "userId", "config", "createdAt", "updatedAt"] - rows = db.fetch_many( + rows = con.execute( f""" SELECT {','.join(columns)} FROM chart_revisions WHERE chartId IN ({chart_ids_str}) """ - ) + ).fetchmany() df_chart_revisions = pd.DataFrame(rows, columns=columns) return df_charts, df_chart_dimensions, df_chart_revisions def _get_variable_year_ranges(self) -> Dict[int, List[int]]: - with open_db() as db: + with get_engine().connect() as con: all_var_ids = list(self.old_var_id2new_var_id.keys()) + list(self.old_var_id2new_var_id.values()) variable_ids_str = ",".join([str(_id) for _id in all_var_ids]) raise NotImplementedError("data_values was deprecated") - rows = db.fetch_many( + rows = con.execute( f""" SELECT variableId, MIN(year) AS minYear, MAX(year) AS maxYear FROM data_values WHERE variableId IN ({variable_ids_str}) GROUP BY variableId """ - ) + ).fetchmany() var_id2year_range = {} for variable_id, min_year, max_year in rows: var_id2year_range[variable_id] = [min_year, max_year] diff --git a/etl/chart_revision/v1/revision.py b/etl/chart_revision/v1/revision.py index 1cbe360c409..0346de1b9ee 100644 --- a/etl/chart_revision/v1/revision.py +++ b/etl/chart_revision/v1/revision.py @@ -15,7 +15,7 @@ from etl.chart_revision.v1.chart import Chart from etl.chart_revision.v1.variables import VariablesUpdate from etl.config import GRAPHER_USER_ID -from etl.db import get_engine, open_db +from etl.db import get_engine log = get_logger() # The maximum length of the suggested revision reason can't exceed the maximum length specified by the datatype "suggestedReason" in grapher.suggested_chart_revisions table. 
@@ -341,10 +341,10 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): """Submit chart revisions to Grapher.""" n_before = 0 try: - with open_db() as db: - n_before = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_before = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore - res = db.fetch_many( + res = con.execute( """ SELECT * FROM ( @@ -356,7 +356,7 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): ) as grouped WHERE grouped.c > 1 """ - ) + ).fetchmany() if len(res): raise RuntimeError( "Two or more suggested chart revisions with status IN " @@ -387,13 +387,13 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): VALUES (%s, %s, %s, %s, %s, %s, %s, NOW(), NOW()) """ - db.upsert_many(query, tuples) + con.execute(query, tuples) # checks if any of the affected chartIds now has multiple # pending suggested revisions. If so, then rejects the whole # insert and tell the user which suggested chart revisions need # to be approved/rejected. - res = db.fetch_many( + res = con.execute( f""" SELECT id, scr.chartId, c, createdAt FROM ( @@ -411,7 +411,7 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): WHERE grouped.c > 1 ORDER BY createdAt ASC """ - ) + ).fetchmany() if len(res): df = pd.DataFrame(res, columns=["id", "chart_id", "count", "created_at"]) df["drop"] = df.groupby("chart_id")["created_at"].transform(lambda gp: gp == gp.max()) @@ -441,8 +441,8 @@ def submit_revisions_to_grapher(revisions: List[ChartVariableUpdateRevision]): log.info(f"INSERT operation into `suggested_chart_revisions` cancelled. Error: {e}") raise e finally: - with open_db() as db: - n_after = db.fetch_one("SELECT COUNT(id) FROM suggested_chart_revisions")[0] + with get_engine().connect() as con: + n_after = con.execute("SELECT COUNT(id) FROM suggested_chart_revisions").fetchone()[0] # type: ignore log.info(f"{n_after - n_before} of {len(revisions)} suggested chart revisions inserted.") @@ -452,23 +452,23 @@ def _get_chart_update_reason(variable_ids: List[int]) -> str: Accesses DB and finds out the name of the recently added dataset with the new variables.""" try: - with open_db() as db: + with get_engine().connect() as con: if len(variable_ids) == 1: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN ({variable_ids[0]}) """ - ) + ).fetchmany() else: - results = db.fetch_many( + results = con.execute( f""" SELECT variables.name, datasets.name, datasets.version FROM datasets JOIN variables ON datasets.id = variables.datasetId WHERE variables.id IN {*variable_ids,} """ - ) + ).fetchmany() except Exception: log.error( "Problem found when accessing the DB trying to get details on the newly added variables" diff --git a/etl/command.py b/etl/command.py index 62a8f005b17..5aa7d184ecc 100644 --- a/etl/command.py +++ b/etl/command.py @@ -5,13 +5,17 @@ import difflib import itertools +import json import re import resource import sys import time +from collections.abc import MutableMapping from concurrent.futures import FIRST_COMPLETED, Future, ProcessPoolExecutor, ThreadPoolExecutor, wait from contextlib import contextmanager +from functools import partial from graphlib import TopologicalSorter +from multiprocessing import Manager from os import environ from pathlib 
import Path from typing import Any, Callable, Dict, Iterator, List, Optional, Set @@ -343,11 +347,18 @@ def run_dag( print("--- All datasets up to date!") return + # Calculate total expected time for all steps (if run sequentially) + total_expected_time_seconds = sum(_get_execution_time(str(step)) or 0 for step in steps) + if dry_run: - print(f"--- Running {len(steps)} steps:") + print( + f"--- Would run {len(steps)} steps{_create_expected_time_message(total_expected_time_seconds, prepend_message=' (at least ')}:" + ) return enumerate_steps(steps) elif workers == 1: - print(f"--- Running {len(steps)} steps:") + print( + f"--- Running {len(steps)} steps{_create_expected_time_message(total_expected_time_seconds, prepend_message=' (at least ')}:" + ) return exec_steps(steps, strict=strict) else: print(f"--- Running {len(steps)} steps with {workers} processes:") @@ -355,14 +366,24 @@ def run_dag( def exec_steps(steps: List[Step], strict: Optional[bool] = None) -> None: + execution_times = {} for i, step in enumerate(steps, 1): - print(f"--- {i}. {step}...") + print(f"--- {i}. {step}{_create_expected_time_message(_get_execution_time(step_name=str(step)))}") + + # Determine strictness level for the current step strict = _detect_strictness_level(step, strict) + with strictness_level(strict): + # Execute the step and measure the time taken time_taken = timed_run(lambda: step.run()) - click.echo(f"{click.style('OK', fg='blue')} ({time_taken:.1f}s)") + execution_times[str(step)] = time_taken + + click.echo(f"{click.style('OK', fg='blue')}{_create_expected_time_message(time_taken)}") print() + # Write the recorded execution times to the file after all steps have been executed + _write_execution_times(execution_times) + def _steps_sort_key(step: Step) -> int: """Sort steps by channel, so that grapher steps are executed first, then garden, then meadow, then snapshots.""" @@ -384,16 +405,27 @@ def exec_steps_parallel(steps: List[Step], workers: int, dag: DAG, strict: Optio # the load on MySQL steps = sorted(steps, key=_steps_sort_key) - # create execution graph from steps - exec_graph = {} - steps_str = {str(step) for step in steps} - for step in steps: - # only add dependencies that are in the list of steps (i.e. are dirty) - # NOTE: we have to compare their string versions, the actual objects might have - # different attributes - exec_graph[str(step)] = {str(dep) for dep in step.dependencies if str(dep) in steps_str} + # Use a Manager dict to collect execution times in parallel execution + with Manager() as manager: + execution_times = manager.dict() + + # Create execution graph from steps + exec_graph = {} + steps_str = {str(step) for step in steps} + for step in steps: + # only add dependencies that are in the list of steps (i.e. 
are dirty) + # NOTE: we have to compare their string versions, the actual objects might have + # different attributes + exec_graph[str(step)] = {str(dep) for dep in step.dependencies if str(dep) in steps_str} - exec_graph_parallel(exec_graph, _exec_step_job, workers, dag=dag, strict=strict) + # Prepare a function for execution that includes the necessary arguments + exec_func = partial(_exec_step_job, execution_times=execution_times, dag=dag, strict=strict) + + # Execute the graph of tasks in parallel + exec_graph_parallel(exec_graph, exec_func, workers) + + # After all tasks have completed, write the execution times to the file + _write_execution_times(dict(execution_times)) def exec_graph_parallel( @@ -417,12 +449,22 @@ def exec_graph_parallel( # Dictionary to keep track of future tasks future_to_task: Dict[Future, str] = {} + ready_tasks = [] + while topological_sorter.is_active(): + # add new tasks + ready_tasks += topological_sorter.get_ready() + # Submit tasks that are ready to the executor - for task in topological_sorter.get_ready(): + # NOTE: limit it to `workers`, otherwise it might accept tasks that are not CPU bound + # and overload our DB + for task in ready_tasks[:workers]: future = executor.submit(func, task, **kwargs) future_to_task[future] = task + # remove ready tasks + ready_tasks = ready_tasks[workers:] + # Wait for at least one future to complete done, _ = wait(future_to_task.keys(), return_when=FIRST_COMPLETED) @@ -433,7 +475,24 @@ def exec_graph_parallel( topological_sorter.done(task) -def _exec_step_job(step_name: str, dag: Optional[DAG] = None, strict: Optional[bool] = None) -> None: +def _create_expected_time_message( + expected_time: Optional[float], prepend_message: str = " (", append_message: str = ")" +) -> str: + minutes, seconds = divmod(expected_time or 0, 60) + if minutes < 1: + partial_message = f"{seconds:.1f}s" + else: + partial_message = f"{int(minutes)}m{seconds: .1f}s" + + if (expected_time is None) or (expected_time == 0): + return "" + else: + return prepend_message + partial_message + append_message + + +def _exec_step_job( + step_name: str, execution_times: MutableMapping, dag: Optional[DAG] = None, strict: Optional[bool] = None +) -> None: """ Executes a step. @@ -441,19 +500,52 @@ def _exec_step_job(step_name: str, dag: Optional[DAG] = None, strict: Optional[b :param dag: The original DAG used to create Step object. This must be the same DAG as given to ETL. :param strict: The strictness level for the step execution. 
""" - print(f"--- Starting {step_name}", flush=True) + print(f"--- Starting {step_name}{_create_expected_time_message(_get_execution_time(step_name))}") assert dag step = parse_step(step_name, dag) strict = _detect_strictness_level(step, strict) with strictness_level(strict): - time_taken = timed_run(lambda: step.run()) + execution_times[step_name] = timed_run(lambda: step.run()) + print(f"--- Finished {step_name} ({execution_times[step_name]:.1f}s)") + - print(f"--- Finished {step_name} ({time_taken:.0f}s)", flush=True) +def _write_execution_times(execution_times: Dict) -> None: + # Write the recorded execution times to a hidden json file that contains the time it took to execute each step + execution_time_file = paths.EXECUTION_TIME_FILE + if execution_time_file.exists(): + with open(execution_time_file, "r") as file: + stored_times = json.load(file) + else: + stored_times = {} + + stored_times.update(execution_times) + with open(execution_time_file, "w") as file: + json.dump(stored_times, file, indent=4, sort_keys=True) + + +def _get_step_identifier(step_name: str) -> str: + return step_name.replace(step_name.split("/")[-2] + "/", "") + + +def _get_execution_time(step_name: str) -> Optional[float]: + # Read execution time of a given step from the hidden json file + # If it doesn't exist, try to read another version of the same step, and if no other version exists, return None + if not paths.EXECUTION_TIME_FILE.exists(): + return None + else: + with open(paths.EXECUTION_TIME_FILE, "r") as file: + execution_times = json.load(file) + execution_time = execution_times.get(step_name) + if not execution_time: + # If the step has not been timed yet, try to find a previous version + step_identifiers = {_get_step_identifier(step): value for step, value in execution_times.items()} + execution_time = step_identifiers.get(_get_step_identifier(step_name)) + return execution_time def enumerate_steps(steps: List[Step]) -> None: for i, step in enumerate(steps, 1): - print(f"{i}. {step}") + print(f"{i}. 
{step}{_create_expected_time_message(_get_execution_time(str(step)))}") def _detect_strictness_level(step: Step, strict: Optional[bool] = None) -> bool: diff --git a/etl/compare.py b/etl/compare.py index 527ae17601f..a690224f3b3 100644 --- a/etl/compare.py +++ b/etl/compare.py @@ -17,7 +17,7 @@ from apps.backport.datasync.data_metadata import variable_data_df_from_s3 from etl import tempcompare -from etl.db import get_engine +from etl.db import get_engine, read_sql @click.group(name="compare", cls=RichGroup) @@ -293,11 +293,7 @@ def read_dataset_from_db(env_path: str, namespace: str, version: str, dataset: s WHERE version = %(version)s and namespace = %(namespace)s and shortName = %(dataset)s """ - df = pd.read_sql( - q, - engine, - params={"version": version, "namespace": namespace, "dataset": dataset}, - ) + df = read_sql(q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}) # drop uninteresting columns df = df.drop(["createdByUserId", "dataEditedAt", "metadataEditedAt", "updatedAt"], axis=1) @@ -316,7 +312,7 @@ def read_variables_from_db(env_path: str, namespace: str, version: str, dataset: WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, @@ -341,7 +337,7 @@ def read_sources_from_db(env_path: str, namespace: str, version: str, dataset: s WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - df = pd.read_sql( + df = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, @@ -365,7 +361,7 @@ def read_values_from_s3(env_path: str, namespace: str, version: str, dataset: st JOIN datasets as d ON v.datasetId = d.id WHERE d.version = %(version)s and d.namespace = %(namespace)s and d.shortName = %(dataset)s """ - vf = pd.read_sql( + vf = read_sql( q, engine, params={"version": version, "namespace": namespace, "dataset": dataset}, diff --git a/etl/config.py b/etl/config.py index 528ba5adc15..d73719b985c 100644 --- a/etl/config.py +++ b/etl/config.py @@ -12,6 +12,7 @@ from os import environ as env import bugsnag +import pandas as pd from dotenv import load_dotenv from etl.paths import BASE_DIR @@ -30,6 +31,10 @@ def load_env(): load_env() + + +pd.set_option("future.no_silent_downcasting", True) + # When DEBUG is on # - run steps in the same process (speeding up ETL) DEBUG = env.get("DEBUG") in ("True", "true", "1") @@ -131,7 +136,7 @@ def variable_metadata_url(variable_id): MAX_VIRTUAL_MEMORY_LINUX = 32 * 2**30 # 32 GB # increment this to force a full rebuild of all datasets -ETL_EPOCH = 4 +ETL_EPOCH = 5 # any garden or grapher dataset after this date will have strict mode enabled STRICT_AFTER = "2023-06-25" @@ -150,6 +155,8 @@ def variable_metadata_url(variable_id): OPENAI_API_KEY = env.get("OPENAI_API_KEY", None) +OWIDBOT_ACCESS_TOKEN = env.get("OWIDBOT_ACCESS_TOKEN", None) + def enable_bugsnag() -> None: if BUGSNAG_API_KEY: diff --git a/etl/data_helpers/population.py b/etl/data_helpers/population.py index d768aab7fa9..6b150b4033c 100644 --- a/etl/data_helpers/population.py +++ b/etl/data_helpers/population.py @@ -111,7 +111,7 @@ def add_population( # Build age groups df_pop = [] - pop["age"] = pop["age"].replace({"100+": 100}).astype("uint") + pop["age"] = pop["age"].astype(str).replace({"100+": 100}).astype("uint") for age_group_name, age_ranges in age_group_mapping.items(): if not age_ranges: age_ranges = [None, 
None] diff --git a/etl/datadiff.py b/etl/datadiff.py index 3fa6dc26d33..9820496d40f 100644 --- a/etl/datadiff.py +++ b/etl/datadiff.py @@ -1,9 +1,10 @@ import difflib import os import re +import traceback from concurrent.futures import ThreadPoolExecutor from pathlib import Path -from typing import Any, Callable, Dict, Iterable, List, Optional, cast +from typing import Any, Callable, Dict, Iterable, List, Optional, Union, cast import numpy as np import pandas as pd @@ -11,7 +12,7 @@ import rich import rich_click as click import structlog -from owid.catalog import Dataset, DatasetMeta, LocalCatalog, RemoteCatalog, Table, find +from owid.catalog import Dataset, DatasetMeta, LocalCatalog, RemoteCatalog, Table, VariableMeta, find from owid.catalog.catalogs import CHANNEL, OWID_CATALOG_URI from rich.console import Console from rich.panel import Panel @@ -111,16 +112,33 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): for col in ds_b[table_name].columns: self.p(f"\t\t[green]+ Column [b]{col}[/b]") else: - table_a = ds_a[table_name] - table_b = ds_b[table_name] + # get both tables in parallel + with ThreadPoolExecutor() as executor: + future_a = executor.submit(ds_a.__getitem__, table_name) + future_b = executor.submit(ds_b.__getitem__, table_name) + + table_a = future_a.result() + table_b = future_b.result() # set default index for datasets that don't have one if table_a.index.names == [None] and table_b.index.names == [None]: candidates = {"entity", "date", "country", "year"} - new_index = list(candidates & set(table_a.columns) & set(table_b.columns)) - if new_index: - table_a = table_a.set_index(new_index) - table_b = table_b.set_index(new_index) + new_index_cols = list(candidates & set(table_a.columns) & set(table_b.columns)) + if new_index_cols: + table_a = table_a.set_index(new_index_cols) + table_b = table_b.set_index(new_index_cols) + + # if using default index, it is possible that we have non-determinstic order + # try sorting by the first two columns + if ( + table_a.index.names == [None] + and table_b.index.names == [None] + and len(table_a) == len(table_b) + and table_a.index[-1] == len(table_a) - 1 + and len(table_a) <= 1000 + ): + table_a = table_a.sort_values(list(table_a.columns)).reset_index(drop=True) + table_b = table_b.sort_values(list(table_b.columns)).reset_index(drop=True) # indexes differ, reset them to make them somehow comparable if table_a.index.names != table_b.index.names: @@ -131,21 +149,19 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): # only sort index if different to avoid unnecessary sorting for huge datasets such as ghe if len(table_a) != len(table_b) or not _index_equals(table_a, table_b): - index_diff = True - table_a, table_b, eq_index = _align_tables(table_a, table_b) - - # if only index order has changed, don't report it - if eq_index.all(): - index_diff = False + table_a, table_b, eq_index, new_index, removed_index = _align_tables(table_a, table_b) else: - index_diff = False eq_index = pd.Series(True, index=table_a.index) + new_index = pd.Series(False, index=table_a.index) + removed_index = pd.Series(False, index=table_a.index) # resetting index will make comparison easier - dims = table_a.index.names + dims = [dim for dim in table_a.index.names if dim is not None] table_a: Table = table_a.reset_index() table_b: Table = table_b.reset_index() - eq_index = eq_index.reset_index(drop=True) + eq_index = cast(pd.Series, eq_index.reset_index(drop=True)) + new_index = cast(pd.Series, 
new_index.reset_index(drop=True)) + removed_index = cast(pd.Series, removed_index.reset_index(drop=True)) # compare table metadata diff = _dict_diff(_table_metadata_dict(table_a), _table_metadata_dict(table_b), tabs=3) @@ -157,8 +173,31 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): else: self.p(f"\t[white]= Table [b]{table_name}[/b]") + # compare index + if not eq_index.all(): + for dim in dims: + if eq_index.all(): + self.p(f"\t\t[white]= Dim [b]{dim}[/b]") + else: + self.p(f"\t\t[yellow]~ Dim [b]{dim}[/b]") + if self.verbose: + dims_without_dim = [d for d in dims if d != dim] + out = _data_diff( + table_a, + table_b, + dim, + dims_without_dim, + eq_index, + eq_index, + new_index, + removed_index, + tabs=4, + ) + if out: + self.p(out) + # compare columns - all_cols = sorted(set(table_a.columns) | set(table_b.columns)) + all_cols = sorted((set(table_a.columns) | set(table_b.columns)) - set(dims)) for col in all_cols: if self.cols and not re.search(self.cols, col): continue @@ -171,31 +210,33 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str): col_a = table_a[col] col_b = table_b[col] + # metadata diff + meta_diff = _dict_diff( + _column_metadata_dict(col_a.metadata), _column_metadata_dict(col_b.metadata), tabs=4 + ) + # equality on index and series eq_data = series_equals(table_a[col], table_b[col]) - data_diff = (~eq_data).any() - eq = eq_index & eq_data - col_a_meta = col_a.metadata.to_dict() - col_b_meta = col_b.metadata.to_dict() - - meta_diff = _dict_diff(col_a_meta, col_b_meta, tabs=4) - - changed = ( - (["data"] if data_diff else []) - + (["metadata"] if meta_diff else []) - + (["index"] if index_diff else []) - ) + changed = [] + if meta_diff: + changed.append("changed [u]metadata[/u]") + if new_index.any(): + changed.append("new [u]data[/u]") + if (~eq_data[~new_index]).any(): + changed.append("changed [u]data[/u]") if changed: - self.p(f"\t\t[yellow]~ Column [b]{col}[/b] (changed [u]{' & '.join(changed)}[/u])") + self.p(f"\t\t[yellow]~ Column [b]{col}[/b] ({', '.join(changed)})") if self.verbose: if meta_diff: - self.p(_dict_diff(col_a_meta, col_b_meta, tabs=4)) - if data_diff or index_diff: + self.p(meta_diff) + if new_index.any() or removed_index.any() or (~eq_data).any(): if meta_diff: self.p("") - out = _data_diff(table_a, table_b, col, dims, tabs=4, eq=eq) + out = _data_diff( + table_a, table_b, col, dims, eq_data, eq_index, new_index, removed_index, tabs=4 + ) if out: self.p(out) else: @@ -279,6 +320,13 @@ def __getitem__(self, name: str) -> Table: is_flag=True, help="Print code snippet for loading both tables, useful for debugging in notebook", ) +@click.option( + "--workers", + "-w", + type=int, + help="Use multiple threads.", + default=1, +) def cli( path_a: str, path_b: str, @@ -288,11 +336,14 @@ def cli( exclude: Optional[str], verbose: bool, snippet: bool, + workers: int, ) -> None: """Compare all datasets from two catalogs and print out a summary of their differences. Compare all the datasets from catalog in `PATH_A` with all the datasets in catalog `PATH_B`. The catalog paths link to the `data/` folder with all the datasets (it contains a `catalog.meta.json` file) + You can also use a path to a dataset. + Note that you can use the keyword "REMOTE" as the path, if you want to run a comparison with the remote catalog. This tool is useful as a quick way to see what has changed in the catalog and whether our updates don't have any unexpected side effects. 
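Under the hood, the new/removed/changed split that `etl diff` now reports per table comes from an outer index alignment with marker columns (see the `_align_tables` and `_data_diff` changes above). Below is a minimal sketch of that idea using plain pandas DataFrames rather than owid Tables; `classify_rows` and the toy frames are illustrative only and not part of datadiff.

# Minimal sketch (not the datadiff API): classify the rows of two uniquely
# indexed frames into new / removed / changed, mirroring the "_x" marker trick.
import pandas as pd


def classify_rows(df_a: pd.DataFrame, df_b: pd.DataFrame, value_col: str):
    assert df_a.index.is_unique and df_b.index.is_unique, "Index must be unique."

    # Marker columns survive the outer alignment and record where each row came from.
    a, b = df_a.assign(_x=1).align(df_b.assign(_x=1), join="outer")

    new_index = a["_x"].isnull()       # rows that exist only in df_b
    removed_index = b["_x"].isnull()   # rows that exist only in df_a
    eq_index = ~(new_index | removed_index)

    # Compare values only where both sides have the row; treat NaN == NaN as equal.
    same = (a[value_col] == b[value_col]) | (a[value_col].isnull() & b[value_col].isnull())
    changed_index = eq_index & ~same

    return new_index, removed_index, changed_index


if __name__ == "__main__":
    a = pd.DataFrame({"pop": [1, 2, 3]}, index=["fra", "ita", "esp"])
    b = pd.DataFrame({"pop": [1, 5, 4]}, index=["fra", "ita", "prt"])
    new, removed, changed = classify_rows(a, b, "pop")
    print(new.sum(), removed.sum(), changed.sum())  # -> 1 1 1 (prt new, esp removed, ita changed)

Keeping the three masks separate is what lets the summary report new and removed rows independently of value changes, instead of lumping everything under a single "index diff" flag as before.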
@@ -315,19 +366,29 @@ def cli( $ etl diff other-data/ data/ --include maddison ``` """ - console = Console(tab_size=2) + console = Console(tab_size=2, soft_wrap=True) path_to_ds_a = _load_catalog_datasets(path_a, channel, include, exclude) path_to_ds_b = _load_catalog_datasets(path_b, channel, include, exclude) - # only keep datasets in DAG + # only keep datasets in DAG, unless there's only one dataset selected by precise path dag_steps = {s.split("://")[1] for s in load_dag().keys()} - path_to_ds_a = {k: v for k, v in path_to_ds_a.items() if k in dag_steps} - path_to_ds_b = {k: v for k, v in path_to_ds_b.items() if k in dag_steps} + if len(path_to_ds_a) > 1: + path_to_ds_a = {k: v for k, v in path_to_ds_a.items() if k in dag_steps} + if len(path_to_ds_b) > 1: + path_to_ds_b = {k: v for k, v in path_to_ds_b.items() if k in dag_steps} + + if not path_to_ds_a: + console.print(f"[yellow]❓ No datasets found in {path_a}[/yellow]") + exit(0) + if not path_to_ds_b: + console.print(f"[yellow]❓ No datasets found in {path_b}[/yellow]") + exit(0) any_diff = False any_error = False + matched_datasets = [] for path in sorted(set(path_to_ds_a.keys()) | set(path_to_ds_b.keys())): ds_a = _match_dataset(path_to_ds_a, path) ds_b = _match_dataset(path_to_ds_b, path) @@ -337,27 +398,65 @@ def cli( # to improve performance. Source checksum should be enough continue - lines = [] + matched_datasets.append((ds_a, ds_b)) - def _append_and_print(x): - lines.append(x) - console.print(x) + if workers > 1: + futures = [] - try: - differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) - differ.summary() - except DatasetError as e: - # soft fail and continue with another dataset - _append_and_print(f"[bold red]⚠ Error: {e}[/bold red]") - continue - except Exception as e: - # soft fail and continue with another dataset - log.error(e, exc_info=True) - any_error = True - continue + with ThreadPoolExecutor(max_workers=workers) as executor: + for ds_a, ds_b in matched_datasets: - if any("~" in line for line in lines if isinstance(line, str)): - any_diff = True + def func(ds_a, ds_b): + lines = [] + differ = DatasetDiff( + ds_a, ds_b, cols=cols, print=lambda x: lines.append(x), verbose=verbose, snippet=snippet + ) + differ.summary() + return lines + + futures.append(executor.submit(func, ds_a, ds_b)) + + for future in futures: + try: + lines = future.result() + except DatasetError as e: + # soft fail and continue with another dataset + lines = [f"[bold red]⚠ Error: {e}[/bold red]"] + except Exception as e: + # soft fail and continue with another dataset + log.error("\n".join(traceback.format_exception(type(e), e, e.__traceback__))) + any_error = True + lines = [] + continue + + for line in lines: + console.print(line) + + if "~" in line: + any_diff = True + else: + for ds_a, ds_b in matched_datasets: + lines = [] + + def _append_and_print(x): + lines.append(x) + console.print(x) + + try: + differ = DatasetDiff(ds_a, ds_b, cols=cols, print=_append_and_print, verbose=verbose, snippet=snippet) + differ.summary() + except DatasetError as e: + # soft fail and continue with another dataset + _append_and_print(f"[bold red]⚠ Error: {e}[/bold red]") + continue + except Exception as e: + # soft fail and continue with another dataset + log.error("\n".join(traceback.format_exception(type(e), e, e.__traceback__))) + any_error = True + continue + + if any("~" in line for line in lines if isinstance(line, str)): + any_diff = True console.print() if not path_to_ds_a and not path_to_ds_b: @@ 
-388,8 +487,8 @@ def _index_equals(table_a: pd.DataFrame, table_b: pd.DataFrame, sample: int = 10 index_a = table_a.index index_b = table_b.index else: - index_a = table_a.sample(sample, random_state=0).index - index_b = table_b.sample(sample, random_state=0).index + index_a = table_a.sample(sample, random_state=0, replace=True).index + index_b = table_b.sample(sample, random_state=0, replace=True).index return index_a.equals(index_b) @@ -413,23 +512,82 @@ def _dict_diff(dict_a: Dict[str, Any], dict_b: Dict[str, Any], tabs: int = 0, ** return "\t" * tabs + "".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() +def _df_to_str(df: pd.DataFrame, limit: int = 5) -> list[str]: + lines = [] + if len(df) > limit: + df_samp = df.sample(limit, random_state=0).sort_index() + else: + df_samp = df + + for line in df_samp.to_string(index=False).split("\n"): # type: ignore + lines.append(" " + line) + return lines + + def _data_diff( - table_a: Table, table_b: Table, col: str, dims: list[str], tabs: int, eq: Optional[pd.Series] = None + table_a: Table, + table_b: Table, + col: str, + dims: list[str], + eq_data: pd.Series, + eq_index: pd.Series, + new_index: pd.Series, + removed_index: pd.Series, + tabs: int = 0, ) -> str: """Return summary of data differences.""" - if eq is None: - eq = series_equals(table_a[col], table_b[col]) + # eq = eq_data & eq_index + n = (eq_index | new_index).sum() - lines = [ - f"- Changed values: {(~eq).sum()} / {len(eq)} ({(~eq).sum() / len(eq) * 100:.2f}%)", - ] + lines = [] + cols = [d for d in dims if d is not None] + [col] + + # new values + if new_index.any(): + lines.append( + f"+ New values: {new_index.sum()} / {n} ({new_index.sum() / n * 100:.2f}%)", + ) + lines += _df_to_str(table_b.loc[new_index, cols]) + + # removed values + if removed_index.any(): + lines.append( + f"- Removed values: {removed_index.sum()} / {n} ({removed_index.sum() / n * 100:.2f}%)", + ) + lines += _df_to_str(table_a.loc[removed_index, cols]) + + # changed values + neq = ~eq_data & eq_index + if neq.any(): + lines.append( + f"~ Changed values: {neq.sum()} / {n} ({neq.sum() / n * 100:.2f}%)", + ) + samp_a = table_a.loc[neq, cols] + samp_b = table_b.loc[neq, cols] + both = samp_a.merge(samp_b, on=dims, suffixes=(" -", " +")) + lines += _df_to_str(both) + + # add color + lines = ["[violet]" + line for line in lines] + + if not lines: + return "" + else: + # add tabs + return "\t" * tabs + "\n".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() + + """OLD CODE, PARTS OF IT COULD BE STILL USEFUL # changes in index for dim in dims: if dim is not None: diff_elements = table_a.loc[~eq, dim].dropna().astype(str).sort_values().unique().tolist() detail = f"{len(diff_elements)} affected" if len(diff_elements) > 5 else ", ".join(diff_elements) - lines.append(f"- {dim}: {detail}") + lines.append(f"- Dim `{dim}`: {detail}") + + lines.append( + f"- Changed values: {(~eq).sum()} / {len(eq)} ({(~eq).sum() / len(eq) * 100:.2f}%)", + ) # changes in values if ( @@ -452,15 +610,7 @@ def _data_diff( rel_diff = abs_diff / mean if not pd.isnull(mean) and mean != 0 else np.nan lines.append(f"- Avg. 
change: {abs_diff:.2f} ({rel_diff:.0%})") - - # add color - lines = ["[violet]" + line for line in lines] - - if not lines: - return "" - else: - # add tabs - return "\t" * tabs + "\n".join(lines).replace("\n", "\n" + "\t" * tabs).rstrip() + """ def _is_datetime(dtype: Any) -> bool: @@ -470,7 +620,7 @@ def _is_datetime(dtype: Any) -> bool: return False -def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Series]: +def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Series, pd.Series, pd.Series]: if not table_a.index.is_unique or not table_b.index.is_unique: raise DatasetError("Index must be unique.") @@ -488,11 +638,14 @@ def _align_tables(table_a: Table, table_b: Table) -> tuple[Table, Table, pd.Seri table_b["_x"] = 1 table_a, table_b = table_a.align(table_b, join="outer", copy=False) - eq_index = table_a["_x"].notnull() & table_b["_x"].notnull() + new_index = table_a["_x"].isnull() + removed_index = table_b["_x"].isnull() + + eq_index = ~(new_index | removed_index) table_a.drop(columns="_x", inplace=True) table_b.drop(columns="_x", inplace=True) - return cast(Table, table_a), cast(Table, table_b), eq_index + return cast(Table, table_a), cast(Table, table_b), eq_index, new_index, removed_index def _sort_index(df: Table) -> Table: @@ -554,10 +707,20 @@ def _table_metadata_dict(tab: Table) -> Dict[str, Any]: # for col in tab.columns: # d["columns"][col] = tab[col].metadata.to_dict() + # sort primary key + if "primary_key" in d: + d["primary_key"] = sorted(d["primary_key"]) + del d["dataset"] return d +def _column_metadata_dict(meta: VariableMeta) -> Dict[str, Any]: + d = meta.to_dict() + d.pop("processing_log", None) + return d + + def _dataset_metadata_dict(ds: Dataset) -> Dict[str, Any]: """Extract metadata from Dataset object, prune and and return it as a dictionary""" d = ds.metadata.to_dict() @@ -571,10 +734,21 @@ def _dataset_metadata_dict(ds: Dataset) -> Dict[str, Any]: def _local_catalog_datasets( - catalog_path: str, channels: Iterable[CHANNEL], include: Optional[str], exclude: Optional[str] + catalog_path: Union[str, Path], channels: Iterable[CHANNEL], include: Optional[str], exclude: Optional[str] ) -> Dict[str, Dataset]: """Return a mapping from dataset path to Dataset object of local catalog.""" - lc_a = LocalCatalog(catalog_path, channels=channels) + catalog_path = Path(catalog_path) + catalog_dir = catalog_path + + # it is possible to use subset of a data catalog + while not (catalog_dir / "catalog.meta.json").exists() and catalog_dir != catalog_dir.parent: + catalog_dir = catalog_dir.parent + + if catalog_dir != catalog_path: + assert include is None, "Include pattern is not supported for subset of a catalog" + include = str(catalog_path.relative_to(catalog_dir)) + + lc_a = LocalCatalog(catalog_dir, channels=channels) datasets = [] for chan in lc_a.channels: channel_datasets = list(lc_a.iter_datasets(chan, include=include)) @@ -584,8 +758,11 @@ def _local_catalog_datasets( datasets += channel_datasets + # only compare public datasets + datasets = [ds for ds in datasets if ds.is_public] + # keep only relative path of dataset - mapping = {str(Path(ds.path).relative_to(catalog_path)): ds for ds in datasets} + mapping = {str(Path(ds.path).relative_to(catalog_dir)): ds for ds in datasets} if exclude: re_exclude = re.compile(exclude) @@ -619,10 +796,10 @@ def _remote_catalog_datasets(channels: Iterable[CHANNEL], include: str, exclude: ds_paths = frame["ds_paths"] if include: - ds_paths = ds_paths[ds_paths.str.contains(include)] + 
ds_paths = ds_paths[ds_paths.str.contains(include, regex=True)] if exclude: - ds_paths = ds_paths[~ds_paths.str.contains(exclude)] + ds_paths = ds_paths[~ds_paths.str.contains(exclude, regex=True)] ds_paths = set(ds_paths) diff --git a/etl/db.py b/etl/db.py index e2a7a9b9fa8..6f8c42cc706 100644 --- a/etl/db.py +++ b/etl/db.py @@ -1,7 +1,6 @@ -import traceback +import functools +import os import warnings -from collections.abc import Generator -from contextlib import contextmanager from typing import Any, Dict, List, Optional from urllib.parse import quote @@ -14,7 +13,6 @@ from sqlmodel import Session from etl import config -from etl.db_utils import DBUtils log = structlog.get_logger() @@ -46,9 +44,8 @@ def get_session(**kwargs) -> Session: return Session(get_engine(**kwargs)) -def get_engine(conf: Optional[Dict[str, Any]] = None) -> Engine: - cf: Any = dict_to_object(conf) if conf else config - +@functools.cache +def _get_engine_cached(cf: Any, pid: int) -> Engine: return create_engine( f"mysql://{cf.DB_USER}:{quote(cf.DB_PASS)}@{cf.DB_HOST}:{cf.DB_PORT}/{cf.DB_NAME}", pool_size=30, # Increase the pool size to allow higher GRAPHER_WORKERS @@ -56,29 +53,11 @@ def get_engine(conf: Optional[Dict[str, Any]] = None) -> Engine: ) -@contextmanager -def open_db() -> Generator[DBUtils, None, None]: - connection = None - cursor = None - try: - connection = get_connection() - connection.autocommit(False) - cursor = connection.cursor() - yield DBUtils(cursor) - connection.commit() - except Exception as e: - log.error(f"Error encountered during import: {e}") - log.error("Rolling back changes...") - if connection: - connection.rollback() - if config.DEBUG: - traceback.print_exc() - raise e - finally: - if cursor: - cursor.close() - if connection: - connection.close() +def get_engine(conf: Optional[Dict[str, Any]] = None) -> Engine: + cf: Any = dict_to_object(conf) if conf else config + # pid in memoization makes sure every process gets its own Engine + pid = os.getpid() + return _get_engine_cached(cf, pid) def get_dataset_id( @@ -459,3 +438,14 @@ def get_info_for_etl_datasets(db_conn: Optional[MySQLdb.Connection] = None) -> p df.loc[df["is_private"], "step"] = df[df["is_private"]]["step"].str.replace("data://", "data-private://") return df + + +def read_sql(sql: str, engine: Optional[Engine] = None, *args, **kwargs) -> pd.DataFrame: + """Wrapper around pd.read_sql that creates a connection and closes it after reading the data. + This adds overhead, so if you need performance, reuse the same connection and cursor. + """ + engine = engine or get_engine() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + with engine.connect() as con: + return pd.read_sql(sql, con.connection, *args, **kwargs) diff --git a/etl/db_utils.py b/etl/db_utils.py deleted file mode 100644 index 35213052f3b..00000000000 --- a/etl/db_utils.py +++ /dev/null @@ -1,142 +0,0 @@ -"""This module was inspired by https://github.com/owid/importers/blob/master/db_utils.py. 
It is not meant -to be extended, but slowly replaced by etl/grapher_model.py""" - -from typing import Any, Dict, Iterable, List, Optional, Tuple, cast - -import structlog -from MySQLdb import IntegrityError -from MySQLdb.cursors import Cursor -from unidecode import unidecode - -log = structlog.get_logger() - -UNMODIFIED = 0 -INSERT = 1 -UPDATE = 2 - - -def normalize_entity_name(entity_name: str) -> str: - return unidecode(entity_name.strip()) - - -class NotOne(ValueError): - pass - - -class DBUtils: - def __init__(self, cursor: Cursor): - self.cursor = cursor - self.entity_id_by_normalised_name: Dict[str, int] = {} - - def get_entity_cache(self) -> Dict[str, int]: - return self.entity_id_by_normalised_name - - def fetch_one_or_none(self, *args: Any, **kwargs: Any) -> Any: - self.cursor.execute(*args, **kwargs) - rows = self.cursor.fetchall() - if len(rows) > 1: - raise NotOne("Expected 1 or 0 rows but received %d" % (len(rows))) - elif len(rows) == 1: - return rows[0] - else: - return None - - def fetch_one(self, *args: Any, **kwargs: Any) -> Any: - result = self.fetch_one_or_none(*args, **kwargs) - if result is None: - raise NotOne("Expected 1 row but received 0") - else: - return result - - def fetch_many(self, *args: Any, **kwargs: Any) -> List[Any]: - self.cursor.execute(*args, **kwargs) - return cast(List[Any], self.cursor.fetchall()) - - def insert_one(self, *args: Any, **kwargs: Any) -> int: - self.cursor.execute(*args, **kwargs) - return int(self.cursor.lastrowid) - - def upsert_one(self, *args: Any, **kwargs: Any) -> Optional[int]: - self.cursor.execute(*args, **kwargs) - if self.cursor.rowcount == 0: - return UNMODIFIED - if self.cursor.rowcount == 1: - return INSERT - if self.cursor.rowcount == 2: - return UPDATE - return None - - def upsert_many(self, query: str, tuples: Iterable[Tuple[Any, ...]]) -> None: - self.cursor.executemany(query, list(tuples)) - - def execute_until_empty(self, *args: Any, **kwargs: Any) -> None: - first = True - while first or self.cursor.rowcount > 0: - first = False - self.cursor.execute(*args, **kwargs) - - def __get_cached_entity_id(self, name: str) -> Optional[int]: - normalised_name = normalize_entity_name(name) - if normalised_name in self.entity_id_by_normalised_name: - return self.entity_id_by_normalised_name[normalised_name] - else: - return None - - def get_or_create_entity(self, name: str) -> int: - # Serve from cache if available - entity_id = self.__get_cached_entity_id(name) - if entity_id is not None: - return entity_id - # Populate cache from database - self.prefill_entity_cache([name]) - entity_id = self.__get_cached_entity_id(name) - if entity_id is not None: - return entity_id - # If still not in cache, it's a new entity and we have to insert it - else: - try: - self.upsert_one( - """ - INSERT INTO entities - (name, displayName, validated, createdAt, updatedAt) - VALUES - (%s, '', FALSE, NOW(), NOW()) - """, - [name], - ) - except IntegrityError: - # If another process inserted the same entity before us, we can - # safely ignore the error and fetch the ID - pass - - (entity_id,) = self.fetch_one( - """ - SELECT id FROM entities - WHERE name = %s - """, - [name], - ) - # Cache the newly created entity - self.entity_id_by_normalised_name[normalize_entity_name(name)] = entity_id - return cast(int, entity_id) - - def prefill_entity_cache(self, names: List[str]) -> None: - rows = self.fetch_many( - """ - SELECT - name, - id - FROM entities - WHERE - entities.name IN %(country_names)s - ORDER BY entities.id ASC - """, - {"country_names": 
[normalize_entity_name(x) for x in names]}, - ) - # Merge the two dicts - self.entity_id_by_normalised_name.update( - { - # entityName → entityId - **dict((row[0], row[1]) for row in rows if row[1]), - } - ) diff --git a/etl/grapher_helpers.py b/etl/grapher_helpers.py index a6bc869bc35..04c6467cf47 100644 --- a/etl/grapher_helpers.py +++ b/etl/grapher_helpers.py @@ -1,5 +1,4 @@ import copy -import warnings from copy import deepcopy from dataclasses import dataclass, field, is_dataclass from pathlib import Path @@ -10,11 +9,12 @@ import pandas as pd import structlog from jinja2 import Environment +from MySQLdb import IntegrityError from owid import catalog from owid.catalog.utils import underscore +from sqlalchemy.engine import Engine -from etl.db import get_connection, get_engine -from etl.db_utils import DBUtils +from etl.db import get_engine, read_sql from etl.files import checksum_str log = structlog.get_logger() @@ -301,24 +301,54 @@ def long_to_wide_tables( yield cast(catalog.Table, t) -def _get_entities_from_db(countries: Set[str], by: Literal["name", "code"]) -> Dict[str, int]: +def _get_entities_from_db( + countries: Set[str], by: Literal["name", "code"], engine: Engine | None = None +) -> Dict[str, int]: q = f"select id as entity_id, {by} from entities where {by} in %(names)s" - df = pd.read_sql(q, get_engine(), params={"names": list(countries)}) + df = read_sql(q, engine, params={"names": list(countries)}) return cast(Dict[str, int], df.set_index(by).entity_id.to_dict()) -def _get_and_create_entities_in_db(countries: Set[str]) -> Dict[str, int]: - cursor = get_connection().cursor() - db = DBUtils(cursor) - log.info("Creating entities in DB", countries=countries) - return {name: db.get_or_create_entity(name) for name in countries} +def _get_and_create_entities_in_db(countries: Set[str], engine: Engine | None = None) -> Dict[str, int]: + engine = engine or get_engine() + with engine.connect() as con: + log.info("Creating entities in DB", countries=countries) + out = {} + for name in countries: + try: + con.execute( + """ + INSERT INTO entities + (name, displayName, validated, createdAt, updatedAt) + VALUES + (%(name)s, '', FALSE, NOW(), NOW()) + """, + {"name": name}, + ) + except IntegrityError: + # If another process inserted the same entity before us, we can + # safely ignore the error and fetch the ID + pass + + row = con.execute( + """ + SELECT id FROM entities + WHERE name = %(name)s + """, + {"name": name}, + ).fetchone() + assert row + + out[name] = row[0] + + return out def country_to_entity_id( country: pd.Series, create_entities: bool = False, - errors: Literal["raise", "ignore", "warn"] = "raise", by: Literal["name", "code"] = "name", + engine: Engine | None = None, ) -> pd.Series: """Convert country name to grapher entity_id. Most of countries should be in countries_regions.csv, however some regions could be only in `entities` table in MySQL or doesn't exist at all. 
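`_get_and_create_entities_in_db` now inlines the old `DBUtils.get_or_create_entity` logic: insert, tolerate a duplicate-key error from a concurrent writer, then select the id. A minimal sketch of that insert-then-select pattern is shown below, assuming an `entities` table with a unique `name` column; it uses `sqlalchemy.text()` named binds and one transaction per name, so it is not a drop-in for the helper in the patch. Note that SQLAlchemy wraps driver errors, so the exception to catch at this level is `sqlalchemy.exc.IntegrityError`.

# Sketch of the get-or-create pattern, assuming an `entities` table with a unique
# `name` column (MySQL syntax). One transaction per name keeps a duplicate-key
# failure isolated from the other inserts.
from typing import Dict, Set

from sqlalchemy import text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import IntegrityError


def get_or_create_entities(engine: Engine, names: Set[str]) -> Dict[str, int]:
    out: Dict[str, int] = {}
    for name in names:
        try:
            with engine.begin() as con:  # commits on success, rolls back on error
                con.execute(
                    text(
                        "INSERT INTO entities (name, displayName, validated, createdAt, updatedAt) "
                        "VALUES (:name, '', FALSE, NOW(), NOW())"
                    ),
                    {"name": name},
                )
        except IntegrityError:
            # Another process created the entity first; we only need its id below.
            pass
        with engine.connect() as con:
            row = con.execute(text("SELECT id FROM entities WHERE name = :name"), {"name": name}).fetchone()
        assert row is not None, f"entity {name} must exist after the insert attempt"
        out[name] = row[0]
    return out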
@@ -331,7 +361,7 @@ def country_to_entity_id( :param by: use `name` if you use country names, `code` if you use ISO codes """ # fill entities from DB - db_entities = _get_entities_from_db(set(country), by=by) + db_entities = _get_entities_from_db(set(country), by=by, engine=engine) entity_id = country.map(db_entities).astype(float) # create entities in DB @@ -339,21 +369,11 @@ def country_to_entity_id( assert by == "name", "create_entities works only with `by='name'`" ix = entity_id.isnull() # cast to float to fix issues with categories - entity_id[ix] = country[ix].map(_get_and_create_entities_in_db(set(country[ix]))).astype(float) - - if entity_id.isnull().any(): - msg = f"Some countries have not been mapped: {set(country[entity_id.isnull()])}" - if errors == "raise": - raise ValueError(msg) - elif errors == "warn": - warnings.warn(msg) - elif errors == "ignore": - pass - - # Int64 allows NaN values - return cast(pd.Series, entity_id.astype("Int64")) - else: - return cast(pd.Series, entity_id.astype(int)) + entity_id[ix] = country[ix].map(_get_and_create_entities_in_db(set(country[ix]), engine=engine)).astype(float) + + assert not entity_id.isnull().any(), f"Some countries have not been mapped: {set(country[entity_id.isnull()])}" + + return cast(pd.Series, entity_id.astype(int)) def _unique(x: List[Any]) -> List[Any]: @@ -460,7 +480,7 @@ def _adapt_dataset_metadata_for_grapher( def _adapt_table_for_grapher( - table: catalog.Table, country_col: str = "country", year_col: str = "year" + table: catalog.Table, engine: Engine | None = None, country_col: str = "country", year_col: str = "year" ) -> catalog.Table: """Adapt table (from a garden dataset) to be used in a grapher step. This function is not meant to be run explicitly, but by default in the grapher step. @@ -498,8 +518,10 @@ def _adapt_table_for_grapher( assert {"year", country_col} <= set(table.columns), f"Table must have columns {country_col} and year." assert "entity_id" not in table.columns, "Table must not have column entity_id." + table[country_col] = table[country_col].astype(str) + # Grapher needs a column entity id, that is constructed based on the unique entity names in the database. - table["entity_id"] = country_to_entity_id(table[country_col], create_entities=True) + table["entity_id"] = country_to_entity_id(table[country_col], create_entities=True, engine=engine) table = table.drop(columns=[country_col]).rename(columns={year_col: "year"}) table = table.set_index(["entity_id", "year"] + dim_names) diff --git a/etl/grapher_import.py b/etl/grapher_import.py index 97533efc383..5062b30c652 100644 --- a/etl/grapher_import.py +++ b/etl/grapher_import.py @@ -30,7 +30,7 @@ ) from apps.backport.datasync.datasync import upload_gzip_dict from etl import config -from etl.db import open_db +from etl.db import get_engine from . import grapher_helpers as gh from . import grapher_model as gm @@ -213,10 +213,12 @@ def upsert_table( "Tables to be upserted must have no null values. 
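With the `errors` parameter gone, `country_to_entity_id` has exactly one failure mode: any country that cannot be resolved to an entity id aborts the run. A condensed sketch of that flow follows; `to_entity_id`, `fetch_ids` and `create_missing` are hypothetical stand-ins for the real helper and for `_get_entities_from_db` / `_get_and_create_entities_in_db`.

# Illustrative only: the mapping flow of country_to_entity_id with the `errors`
# parameter removed.
from typing import Callable, Dict, Set

import pandas as pd


def to_entity_id(
    country: pd.Series,
    fetch_ids: Callable[[Set[str]], Dict[str, int]],
    create_missing: Callable[[Set[str]], Dict[str, int]],
) -> pd.Series:
    # Map what already exists in the DB; float keeps unmatched rows as NaN.
    entity_id = country.map(fetch_ids(set(country))).astype(float)

    missing = entity_id.isnull()
    if missing.any():
        entity_id[missing] = country[missing].map(create_missing(set(country[missing]))).astype(float)

    # Every country must now resolve to an id; fail loudly instead of warn/ignore.
    assert not entity_id.isnull().any(), f"Unmapped countries: {set(country[entity_id.isnull()])}"
    return entity_id.astype(int)


if __name__ == "__main__":
    ids = to_entity_id(
        pd.Series(["France", "Atlantis"]),
        fetch_ids=lambda names: {"France": 13},
        create_missing=lambda names: {name: 9999 for name in names},
    )
    print(ids.tolist())  # -> [13, 9999]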
Instead they" f" have:\n{table.loc[table.iloc[:, 0].isnull()]}" ) table = table.reorder_levels(["year", "entity_id"]) - assert table.index.dtypes[0] in gh.INT_TYPES, f"year must be of an integer type but was: {table.index.dtypes[0]}" assert ( - table.index.dtypes[1] in gh.INT_TYPES - ), f"entity_id must be of an integer type but was: {table.index.dtypes[1]}" + table.index.dtypes.iloc[0] in gh.INT_TYPES + ), f"year must be of an integer type but was: {table.index.dtypes.iloc[0]}" + assert ( + table.index.dtypes.iloc[1] in gh.INT_TYPES + ), f"entity_id must be of an integer type but was: {table.index.dtypes.iloc[1]}" utils.validate_underscore(table.metadata.short_name, "Table's short_name") utils.validate_underscore(table.columns[0], "Variable's name") @@ -332,7 +334,7 @@ def fetch_db_checksum(dataset: catalog.Dataset) -> Optional[str]: assert dataset.metadata.version, "Dataset must have a version" assert dataset.metadata.namespace, "Dataset must have a namespace" - with Session(gm.get_engine()) as session: + with Session(get_engine()) as session: q = select(gm.Dataset).where( gm.Dataset.shortName == dataset.metadata.short_name, gm.Dataset.version == dataset.metadata.version, @@ -343,7 +345,7 @@ def fetch_db_checksum(dataset: catalog.Dataset) -> Optional[str]: def set_dataset_checksum_and_editedAt(dataset_id: int, checksum: str) -> None: - with Session(gm.get_engine()) as session: + with Session(get_engine()) as session: q = ( update(gm.Dataset) .where(gm.Dataset.id == dataset_id) @@ -357,7 +359,7 @@ def set_dataset_checksum_and_editedAt(dataset_id: int, checksum: str) -> None: session.commit() -def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) -> None: +def cleanup_ghost_variables(engine: Engine, dataset_id: int, upserted_variable_ids: List[int]) -> None: """Remove all leftover variables that didn't get upserted into DB during grapher step. This could happen when you rename or delete a variable in ETL. Raise an error if we try to delete variable used by any chart. 
@@ -366,15 +368,14 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - :param upserted_variable_ids: variables upserted in grapher step :param workers: delete variables in parallel """ - with open_db() as db: + with engine.connect() as con: # get all those variables first - db.cursor.execute( + rows = con.execute( """ SELECT id FROM variables WHERE datasetId=%(dataset_id)s AND id NOT IN %(variable_ids)s """, {"dataset_id": dataset_id, "variable_ids": upserted_variable_ids or [-1]}, - ) - rows = db.cursor.fetchall() + ).fetchall() variable_ids_to_delete = [row[0] for row in rows] @@ -385,19 +386,18 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - log.info("cleanup_ghost_variables.start", size=len(variable_ids_to_delete)) # raise an exception if they're used in any charts - db.cursor.execute( + rows = con.execute( """ SELECT chartId, variableId FROM chart_dimensions WHERE variableId IN %(variable_ids)s """, {"dataset_id": dataset_id, "variable_ids": variable_ids_to_delete}, - ) - rows = db.cursor.fetchall() + ).fetchall() if rows: rows = pd.DataFrame(rows, columns=["chartId", "variableId"]) raise ValueError(f"Variables used in charts will not be deleted automatically:\n{rows}") # then variables themselves with related data in other tables - db.cursor.execute( + con.execute( """ DELETE FROM country_latest_data WHERE variable_id IN %(variable_ids)s """, @@ -405,19 +405,19 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - ) # delete relationships - db.cursor.execute( + con.execute( """ DELETE FROM origins_variables WHERE variableId IN %(variable_ids)s """, {"variable_ids": variable_ids_to_delete}, ) - db.cursor.execute( + con.execute( """ DELETE FROM tags_variables_topic_tags WHERE variableId IN %(variable_ids)s """, {"variable_ids": variable_ids_to_delete}, ) - db.cursor.execute( + con.execute( """ DELETE FROM posts_gdocs_variables_faqs WHERE variableId IN %(variable_ids)s """, @@ -425,7 +425,7 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - ) # delete them from explorers - db.cursor.execute( + con.execute( """ DELETE FROM explorer_variables WHERE variableId IN %(variable_ids)s """, @@ -433,7 +433,7 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - ) # finally delete variables - db.cursor.execute( + result = con.execute( """ DELETE FROM variables WHERE datasetId=%(dataset_id)s AND id IN %(variable_ids)s """, @@ -442,34 +442,34 @@ def cleanup_ghost_variables(dataset_id: int, upserted_variable_ids: List[int]) - log.warning( "cleanup_ghost_variables.end", - size=db.cursor.rowcount, + size=result.rowcount, variables=variable_ids_to_delete, ) -def cleanup_ghost_sources(dataset_id: int, upserted_source_ids: List[int]) -> None: +def cleanup_ghost_sources(engine: Engine, dataset_id: int, upserted_source_ids: List[int]) -> None: """Remove all leftover sources that didn't get upserted into DB during grapher step. This could happen when you rename or delete sources. 
:param dataset_id: ID of the dataset :param upserted_source_ids: sources upserted in grapher step """ - with open_db() as db: + with engine.connect() as con: if upserted_source_ids: - db.cursor.execute( + result = con.execute( """ DELETE FROM sources WHERE datasetId=%(dataset_id)s AND id NOT IN %(source_ids)s """, {"dataset_id": dataset_id, "source_ids": upserted_source_ids}, ) else: - db.cursor.execute( + result = con.execute( """ DELETE FROM sources WHERE datasetId=%(dataset_id)s """, {"dataset_id": dataset_id}, ) - if db.cursor.rowcount > 0: - log.warning(f"Deleted {db.cursor.rowcount} ghost sources") + if result.rowcount > 0: + log.warning(f"Deleted {result.rowcount} ghost sources") def _get_entity_name(session: Session, entity_id: int) -> str: diff --git a/etl/grapher_model.py b/etl/grapher_model.py index 733e27197f8..1f777c807bd 100644 --- a/etl/grapher_model.py +++ b/etl/grapher_model.py @@ -8,7 +8,6 @@ from datetime import date, datetime from pathlib import Path from typing import Annotated, Any, Dict, List, Literal, Optional, TypedDict, Union, get_args -from urllib.parse import quote import humps import pandas as pd @@ -35,7 +34,6 @@ VARCHAR, ) from sqlalchemy.exc import NoResultFound -from sqlalchemy.future import Engine as _FutureEngine from sqlmodel import JSON as _JSON from sqlmodel import ( Column, @@ -43,7 +41,6 @@ Relationship, Session, SQLModel, - create_engine, or_, select, ) @@ -51,6 +48,7 @@ from etl import config, paths from etl.config import GRAPHER_USER_ID +from etl.db import read_sql log = structlog.get_logger() @@ -67,13 +65,6 @@ JSON = _JSON(none_as_null=True) -def get_engine() -> _FutureEngine: - return create_engine( - f"mysql://{config.DB_USER}:{quote(config.DB_PASS)}@{config.DB_HOST}:{config.DB_PORT}/{config.DB_NAME}", - future=False, - ) - - t_active_datasets = Table( "active_datasets", metadata, @@ -717,9 +708,9 @@ def load_sources( ) t order by t.id """ - sources = pd.read_sql( + sources = read_sql( q, - session.bind, + session.bind, # type: ignore params={ "datasetId": dataset_id, # NOTE: query doesn't work with empty list so we use a dummy value @@ -737,7 +728,7 @@ def load_sources( ) sources.datasetId = sources.datasetId.fillna(dataset_id).astype(int) - return [cls(**d) for d in sources.to_dict(orient="records") if cls.validate(d)] + return [cls(**d) for d in sources.to_dict(orient="records") if cls.validate(d)] # type: ignore class SuggestedChartRevisions(SQLModel, table=True): diff --git a/etl/helpers.py b/etl/helpers.py index e769fa63c70..946a884f097 100644 --- a/etl/helpers.py +++ b/etl/helpers.py @@ -8,6 +8,7 @@ import tempfile from collections.abc import Generator from contextlib import contextmanager +from functools import cache from pathlib import Path from typing import Any, Dict, Iterable, Iterator, List, Literal, Optional, Union, cast from urllib.parse import urljoin @@ -377,6 +378,10 @@ class WrongStepName(ExceptionFromDocstring): """Wrong step name. If this step was in the dag, it should be corrected.""" +# loading DAG can take up to 1 second, so cache it +load_dag_cached = cache(load_dag) + + class PathFinder: """Helper object with naming conventions. It uses your module path (__file__) and extracts from it commonly used attributes like channel / namespace / version / short_name or @@ -390,11 +395,8 @@ class PathFinder: def __init__(self, __file__: str, is_private: Optional[bool] = None): self.f = Path(__file__) - # Load dag. 
- if "/archive/" in __file__: - self.dag = load_dag(paths.DAG_ARCHIVE_FILE) - else: - self.dag = load_dag() + # Lazy load dag when needed. + self._dag = None # Current file should be a data step. if not self.f.as_posix().startswith(paths.STEP_DIR.as_posix()): @@ -412,6 +414,16 @@ def __init__(self, __file__: str, is_private: Optional[bool] = None): # Default logger self.log = structlog.get_logger(step=f"{self.namespace}/{self.channel}/{self.version}/{self.short_name}") + @property + def dag(self): + """Lazy loading of DAG.""" + if self._dag is None: + if "/archive/" in str(self.f): + self._dag = load_dag_cached(paths.DAG_ARCHIVE_FILE) + else: + self._dag = load_dag_cached() + return self._dag + @property def channel(self) -> str: return self.f.parent.parent.parent.name diff --git a/etl/paths.py b/etl/paths.py index 30e465b6feb..a3fa8889535 100644 --- a/etl/paths.py +++ b/etl/paths.py @@ -62,3 +62,6 @@ # Use paths.DAG_ARCHIVE_FILE to load the complete dag, with active and archive steps. # Otherwise use paths.DAG_FILE to load only active steps, ignoring archive ones. DEFAULT_DAG_FILE = DAG_FILE + +# Hidden ETL file that will keep the time it took to execute each step. +EXECUTION_TIME_FILE = BASE_DIR / ".execution_time.json" diff --git a/etl/steps/__init__.py b/etl/steps/__init__.py index 92b83cf33cc..af9ccda8d60 100644 --- a/etl/steps/__init__.py +++ b/etl/steps/__init__.py @@ -29,6 +29,7 @@ from owid.walden import CATALOG as WALDEN_CATALOG from owid.walden import Catalog as WaldenCatalog from owid.walden import Dataset as WaldenDataset +from sqlalchemy.engine import Engine from etl import config, files, git, paths from etl import grapher_helpers as gh @@ -518,7 +519,8 @@ def _output_dataset(self) -> catalog.Dataset: return catalog.Dataset(self._dest_dir.as_posix()) def checksum_output(self) -> str: - return self._output_dataset.checksum() + # output checksum is checksum of all ingredients + return self.checksum_input() def _step_files(self) -> List[str]: "Return a list of code files defining this step." @@ -714,12 +716,7 @@ def has_existing_data(self) -> bool: return True def checksum_output(self) -> str: - # NOTE: we could use the checksum from `_dvc_path` to - # speed this up. Test the performance on - # time poetry run etl run garden --dry-run - # Make sure that the checksum below is the same as DVC checksum! 
It - # looks like it might be different for some reason - return files.checksum_file(self._dvc_path) + return Snapshot(self.path).m.outs[0]["md5"] @property def _dvc_path(self) -> str: @@ -827,7 +824,7 @@ def run(self) -> None: cols += [c for c in table.columns if c in {"year", "country"} and c not in cols] table = table.loc[:, cols] - table = gh._adapt_table_for_grapher(table) + table = gh._adapt_table_for_grapher(table, engine) for t in gh._yield_wide_table(table, na_action="drop"): i += 1 @@ -857,7 +854,7 @@ def run(self) -> None: variable_upsert_results = [future.result() for future in as_completed(futures)] if not config.GRAPHER_FILTER and not config.SUBSET: - self._cleanup_ghost_resources(dataset_upsert_results, variable_upsert_results) + self._cleanup_ghost_resources(engine, dataset_upsert_results, variable_upsert_results) # set checksum and updatedAt timestamps after all data got inserted gi.set_dataset_checksum_and_editedAt(dataset_upsert_results.dataset_id, self.data_step.checksum_input()) @@ -868,6 +865,7 @@ def checksum_output(self) -> str: @classmethod def _cleanup_ghost_resources( cls, + engine: Engine, dataset_upsert_results, variable_upsert_results: List[Any], ) -> None: @@ -886,10 +884,11 @@ def _cleanup_ghost_resources( # Try to cleanup ghost variables, but make sure to raise an error if they are used # in any chart gi.cleanup_ghost_variables( + engine, dataset_upsert_results.dataset_id, upserted_variable_ids, ) - gi.cleanup_ghost_sources(dataset_upsert_results.dataset_id, upserted_source_ids) + gi.cleanup_ghost_sources(engine, dataset_upsert_results.dataset_id, upserted_source_ids) # TODO: cleanup origins that are not used by any variable diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py b/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py index ac765801239..8e4756f09ec 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-08/yougov_end_of_humanity.py @@ -61,6 +61,7 @@ def run(dest_dir: str) -> None: # Transform the 'melted_df_all_age_groups' dataframe into a pivot table with 'options' as index and # each unique value in 'melted_columns' as a column. Store the pivot table in 'pivot_df_all_age_groups'. 
+ melted_df_all_age_groups = melted_df_all_age_groups.astype({"melted_columns": "category"}) pivot_df_all_age_groups = melted_df_all_age_groups.pivot_table( index=["options"], columns="melted_columns", values="value" ) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py index 9fa62b488c5..913118d3c60 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-14/ai_national_strategy.py @@ -21,7 +21,9 @@ def run(dest_dir: str) -> None: ds_meadow = cast(Dataset, paths.load_dependency("ai_national_strategy")) # Load region dataset to find all possible countries and later fill the ones that don't exist in the spreadsheet as not released (according to source that's the implication) ds_regions = cast(Dataset, paths.load_dependency("regions")) - countries_national_ai = pd.DataFrame(ds_regions["regions"]["name"]) + tb_regions = ds_regions["regions"] + tb_regions = tb_regions[tb_regions["defined_by"] == "owid"] + countries_national_ai = pd.DataFrame(tb_regions["name"]) countries_national_ai.reset_index(drop=True, inplace=True) countries_national_ai["released"] = np.NaN # Generate the column names from "2017" to "2022" @@ -58,10 +60,10 @@ def run(dest_dir: str) -> None: # Check if any year for the current country is not NaN if not group["released_national_strategy_on_ai"].isna().all(): # Forward fill NaN values after "Released" - group["released_national_strategy_on_ai"].fillna(method="ffill", inplace=True) + group["released_national_strategy_on_ai"] = group["released_national_strategy_on_ai"].fillna(method="ffill") # Fill remaining NaN values with "Not Released" - group["released_national_strategy_on_ai"].fillna("Not released", inplace=True) + group["released_national_strategy_on_ai"] = group["released_national_strategy_on_ai"].fillna("Not released") df_merged.loc[group.index] = group df_merged.drop("released", axis=1, inplace=True) tb = Table(df_merged, short_name=paths.short_name, underscore=True) diff --git a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py index bfe7cd4e1c7..68e158aa09f 100644 --- a/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py +++ b/etl/steps/data/garden/artificial_intelligence/2023-06-26/ai_wrp_2021.py @@ -154,7 +154,7 @@ def calculate_percentage(df, column, valid_responses_dict, column_to_split_by): df_filtered = df[[column_to_split_by, "year", column]][valid_responses].reset_index(drop=True) # Group by country and year - grouped = df_filtered.groupby([column_to_split_by, "year"]) + grouped = df_filtered.groupby([column_to_split_by, "year"], observed=True) # Count valid responses counts = grouped[column].value_counts().reset_index(name="count") @@ -343,7 +343,7 @@ def pivot_by_category(df, question): # Iterate over each pivot column for pivot_col in cols_pivot: # Pivot the dataframe for the current pivot column - pivoted_df = pd.pivot_table(df, values=question, index=["country", "year"], columns=pivot_col) + pivoted_df = pd.pivot_table(df, values=question, index=["country", "year"], columns=pivot_col, observed=True) # Append the pivot table to the list pivot_tables.append(pivoted_df) diff --git a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml 
b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml index 33ca7d1a1ad..89456c2164a 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml +++ b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.meta.yml @@ -27,6 +27,7 @@ definitions: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 + non_redistributable: true tables: dynabench: diff --git a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py index 208d80198a5..c0d48e24e02 100644 --- a/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py +++ b/etl/steps/data/garden/artificial_intelligence/2024-04-02/dynabench.py @@ -24,6 +24,12 @@ def run(dest_dir: str) -> None: # Selecting the best performance for each benchmark per year tb = tb.groupby(["benchmark", "year"])["performance"].max().reset_index().copy_metadata(from_table=tb) + # Set the first year's performance to the baseline of –1 for each benchmark. + # This is to preserve a baseline for –1 for all benchmarks, + # even when a second, better performance is recorded in a later year. + tb = tb.sort_values(by=["benchmark", "year"]) + tb.loc[tb.groupby("benchmark").head(1).index, "performance"] = -1 + mapping = { "MNIST": "Handwriting recognition", "Switchboard": "Speech recognition", diff --git a/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml new file mode 100644 index 00000000000..2a5bbd540b2 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.meta.yml @@ -0,0 +1,24 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: Climate Change Impacts + update_period_days: 60 + +tables: + climate_change_impacts_annual: + title: Climate Change Impacts - Annual + variables: + arctic_sea_ice_extent_min: + title: Minimum Arctic sea ice extent + arctic_sea_ice_extent_max: + title: Maximum Arctic sea ice extent + antarctic_sea_ice_extent_min: + title: Minimum Antarctic sea ice extent + antarctic_sea_ice_extent_max: + title: Maximum Antarctic sea ice extent + climate_change_impacts_monthly: + title: Climate Change Impacts - Monthly diff --git a/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py new file mode 100644 index 00000000000..38f00ffd808 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/climate_change_impacts.py @@ -0,0 +1,174 @@ +"""Create a garden dataset with all climate change impacts data. + +""" + +from owid.catalog import Table +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_sea_ice_extent(tb_nsidc: Table) -> Table: + tb_nsidc = tb_nsidc.copy() + # Create a table with the minimum and maximum Arctic sea ice extent. + # Assume minimum and maximum occur in September and February every year. 
+ tb_nsidc["month"] = tb_nsidc["date"].astype(str).str[5:7] + tb_nsidc["year"] = tb_nsidc["date"].astype(str).str[0:4].astype(int) + arctic_sea_ice_extent = ( + tb_nsidc[(tb_nsidc["location"] == "Northern Hemisphere") & (tb_nsidc["month"].isin(["02", "09"]))] + .pivot(index=["location", "year"], columns=["month"], values="sea_ice_extent", join_column_levels_with=" ") + .rename(columns={"02": "arctic_sea_ice_extent_max", "09": "arctic_sea_ice_extent_min"}, errors="raise") + ) + # Instead of calling the location a generic "Northern Hemisphere", call it "Arctic Ocean". + arctic_sea_ice_extent["location"] = "Arctic Ocean" + + # Idem for the Antarctic sea ice extent. + # Assume maximum and minimum occur in September and February every year. + antarctic_sea_ice_extent = ( + tb_nsidc[(tb_nsidc["location"] == "Southern Hemisphere") & (tb_nsidc["month"].isin(["02", "09"]))] + .pivot(index=["location", "year"], columns=["month"], values="sea_ice_extent", join_column_levels_with=" ") + .rename(columns={"02": "antarctic_sea_ice_extent_min", "09": "antarctic_sea_ice_extent_max"}, errors="raise") + ) + # Instead of calling the location a generic "Southern Hemisphere", call it "Antarctica". + antarctic_sea_ice_extent["location"] = "Antarctica" + + return arctic_sea_ice_extent, antarctic_sea_ice_extent + + +def prepare_ocean_heat_content(tb_ocean_heat_annual: Table, tb_ocean_heat_annual_epa: Table) -> Table: + # Combine NOAA's annual data on ocean heat content (which is more up-to-date) with the analogous EPA's data based on + # NOAA (which, for some reason, spans a longer time range for 2000m). Prioritize NOAA's data on common years. + tb_ocean_heat_annual = combine_two_overlapping_dataframes( + tb_ocean_heat_annual.rename( + columns={ + "ocean_heat_content_700m": "ocean_heat_content_noaa_700m", + "ocean_heat_content_2000m": "ocean_heat_content_noaa_2000m", + }, + errors="raise", + ), + tb_ocean_heat_annual_epa, + index_columns=["location", "year"], + ) + # Recover the original indicator titles (they are empty because of combining two columns with different titles). + tb_ocean_heat_annual["ocean_heat_content_noaa_700m"].metadata.title = tb_ocean_heat_annual_epa[ + "ocean_heat_content_noaa_700m" + ].metadata.title + tb_ocean_heat_annual["ocean_heat_content_noaa_2000m"].metadata.title = tb_ocean_heat_annual_epa[ + "ocean_heat_content_noaa_2000m" + ].metadata.title + + return tb_ocean_heat_annual + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load GISS dataset surface temperature analysis, and read monthly data. + ds_giss = paths.load_dataset("surface_temperature_analysis") + tb_giss = ds_giss["surface_temperature_analysis"].reset_index() + + # Load NSIDC dataset of sea ice index. + ds_nsidc = paths.load_dataset("sea_ice_index") + tb_nsidc = ds_nsidc["sea_ice_index"].reset_index() + + # Load Met Office dataset on sea surface temperature. + ds_met_office = paths.load_dataset("sea_surface_temperature") + tb_met_office = ds_met_office["sea_surface_temperature"].reset_index() + + # Load NOAA/NCIE dataset on ocean heat content. + ds_ocean_heat = paths.load_dataset("ocean_heat_content", namespace="climate") + tb_ocean_heat_monthly = ds_ocean_heat["ocean_heat_content_monthly"].reset_index() + tb_ocean_heat_annual = ds_ocean_heat["ocean_heat_content_annual"].reset_index() + + # Load EPA's compilation of data on ocean heat content. 
+ ds_epa = paths.load_dataset("ocean_heat_content", namespace="epa") + tb_ocean_heat_annual_epa = ds_epa["ocean_heat_content"].reset_index() + + # Load ocean pH data from the School of Ocean and Earth Science and Technology. + ds_ocean_ph = paths.load_dataset("ocean_ph_levels") + tb_ocean_ph = ds_ocean_ph["ocean_ph_levels"].reset_index() + + # Load snow cover extent from Rutgers University Global Snow Lab. + ds_snow = paths.load_dataset("snow_cover_extent") + tb_snow = ds_snow["snow_cover_extent"].reset_index() + + # Load ice sheet mass balance data from EPA. + ds_ice_sheet = paths.load_dataset("ice_sheet_mass_balance") + tb_ice_sheet = ds_ice_sheet["ice_sheet_mass_balance"].reset_index() + + # Load annual data on mass balance of US glaciers from EPA. + ds_us_glaciers = paths.load_dataset("mass_balance_us_glaciers") + tb_us_glaciers = ds_us_glaciers["mass_balance_us_glaciers"].reset_index() + + # Load monthly greenhouse gas concentration data from NOAA/GML. + ds_gml = paths.load_dataset("ghg_concentration") + tb_gml = ds_gml["ghg_concentration"].reset_index() + + # Load long-run yearly greenhouse gas concentration data. + ds_ghg = paths.load_dataset("long_run_ghg_concentration") + tb_ghg = ds_ghg["long_run_ghg_concentration"].reset_index() + + # Load global sea level. + ds_sea_level = paths.load_dataset("global_sea_level") + tb_sea_level = ds_sea_level["global_sea_level"].reset_index() + + # + # Process data. + # + # Prepare sea ice extent data. + arctic_sea_ice_extent, antarctic_sea_ice_extent = prepare_sea_ice_extent(tb_nsidc=tb_nsidc) + + # Prepare ocean heat content data. + tb_ocean_heat_annual = prepare_ocean_heat_content( + tb_ocean_heat_annual=tb_ocean_heat_annual, tb_ocean_heat_annual_epa=tb_ocean_heat_annual_epa + ) + + # Gather monthly data from different tables. + tb_monthly = tb_giss.astype({"date": str}).copy() + # NOTE: The values in tb_ocean_ph are monthly, but the dates are not consistently on the middle of the month. + # Instead, they are on different days of the month. When merging with other tables, this will create many nans. + # We could reindex linearly, but it's not a big deal. + for table in [ + tb_nsidc, + tb_met_office, + tb_ocean_heat_monthly, + tb_ocean_ph, + tb_snow, + tb_ice_sheet, + tb_gml, + tb_sea_level, + ]: + tb_monthly = tb_monthly.merge( + table.astype({"date": str}), + how="outer", + on=["location", "date"], + validate="one_to_one", + short_name="climate_change_impacts_monthly", + ) + + # Gather annual data from different tables. + tb_annual = tb_ocean_heat_annual.copy() + for table in [arctic_sea_ice_extent, antarctic_sea_ice_extent, tb_ghg, tb_us_glaciers.astype({"year": int})]: + tb_annual = tb_annual.merge( + table, + how="outer", + on=["location", "year"], + validate="one_to_one", + short_name="climate_change_impacts_annual", + ) + tb_annual.metadata.short_name = "climate_change_impacts_annual" + + # Set an appropriate index to monthly and annual tables, and sort conveniently. + tb_monthly = tb_monthly.set_index(["location", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create explorer dataset with combined table in csv format. 
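The run() step above assembles one wide monthly table by repeatedly outer-merging each source table on ["location", "date"]. A sketch of that pattern with two hypothetical frames (the real step additionally passes a short_name to the owid merge wrapper):

import pandas as pd

tb_monthly = pd.DataFrame(
    {"location": ["World", "World"], "date": ["2000-01-15", "2000-02-15"], "temperature_anomaly": [0.3, 0.4]}
)
other_tables = [
    pd.DataFrame({"location": ["World"], "date": ["2000-01-15"], "sea_ice_extent": [14.1]}),
    pd.DataFrame({"location": ["World"], "date": ["2000-03-15"], "ocean_ph": [8.08]}),
]

# Outer merges keep every (location, date) pair seen in any input, leaving NaNs elsewhere;
# validate="one_to_one" guards against accidental duplicate keys in a source table.
for table in other_tables:
    tb_monthly = tb_monthly.merge(table, how="outer", on=["location", "date"], validate="one_to_one")

print(tb_monthly.sort_values(["location", "date"]))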
+ ds_explorer = create_dataset(dest_dir, tables=[tb_annual, tb_monthly]) + ds_explorer.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml new file mode 100644 index 00000000000..ca5e6073998 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.meta.yml @@ -0,0 +1,44 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + description_short: Measured in parts per million. + +dataset: + update_period_days: 60 + +tables: + ghg_concentration: + title: Monthly greenhouse gas concentration + variables: + co2_concentration: + title: Monthly concentration of atmospheric carbon dioxide + processing_level: minor + unit: parts per million + short_unit: ppm + ch4_concentration: + title: Monthly concentration of atmospheric methane + processing_level: minor + unit: parts per billion + short_unit: ppb + n2o_concentration: + title: Monthly concentration of atmospheric nitrous oxide + processing_level: minor + unit: parts per billion + short_unit: ppb + co2_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric carbon dioxide + processing_level: major + unit: parts per million + short_unit: ppm + ch4_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric methane + processing_level: major + unit: parts per billion + short_unit: ppb + n2o_concentration_yearly_average: + title: Rolling yearly average of the concentration of atmospheric nitrous oxide + processing_level: major + unit: parts per billion + short_unit: ppb diff --git a/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py new file mode 100644 index 00000000000..36d76ea290b --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ghg_concentration.py @@ -0,0 +1,143 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import List + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "year": "year", + "month": "month", + "average": "concentration", + # The following column is loaded only to perform a sanity check. + "decimal": "decimal", +} + + +def add_rolling_average(tb: Table, original_column_names: List[str]) -> Table: + tb_with_average = tb.copy() + + # Create a date range of each month (on the 15th). + # NOTE: The minimum date in the data is "2001-01-15", however, when passing this date to pd.date_range with + # freq="MS", the first point is dismissed because it is not the start of a month. For that reason, we shift the + # first point to be at the beginning of the month. + date_range = pd.date_range( + start=tb_with_average["date"].min() - pd.tseries.offsets.MonthBegin(1), + end=tb_with_average["date"].max(), + freq="MS", + ) + pd.DateOffset(days=14) + + # Get unique locations. + unique_locations = tb_with_average["location"].unique() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index(["location", "date"]).sort_index() + + # Create a MultiIndex with all possible combinations of date and location. + multi_index = pd.MultiIndex.from_product([unique_locations, date_range], names=["location", "date"]) + + # Reindex using the MultiIndex. 
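add_rolling_average() above first builds a complete grid of mid-month dates before reindexing, so that a 12-point rolling window always spans exactly one year. A sketch of that date arithmetic with hypothetical start and end dates: subtracting MonthBegin(1) moves the first observation back to the start of its month so pd.date_range(freq="MS") does not drop it, and the +14 days offset then places every grid point on the 15th.

import pandas as pd

first_date = pd.Timestamp("2001-01-15")
last_date = pd.Timestamp("2001-06-15")

# Month-start range shifted to mid-month.
date_range = pd.date_range(
    start=first_date - pd.tseries.offsets.MonthBegin(1),
    end=last_date,
    freq="MS",
) + pd.DateOffset(days=14)

print(date_range)  # 2001-01-15, 2001-02-15, ..., 2001-06-15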
+ tb_with_average = tb_with_average.reindex(multi_index) + + for original_column_name in original_column_names: + # Create a rolling average with a window of one year, linearly interpolating missing values. + # NOTE: Currently no interpolation is needed, as no data points are missing (and in fact date_range is identical + # to the dates in the data). However, we need to interpolate in case there are missing points. Otherwise all + # points after the missing one will be nan. + tb_with_average[f"{original_column_name}_yearly_average"] = ( + tb_with_average[original_column_name] + .interpolate("linear") + .rolling(12) + .mean() + .copy_metadata(tb_with_average[original_column_name]) + ) + + # Drop empty rows. + tb_with_average = tb_with_average.dropna(subset=original_column_names, how="all").reset_index() + + # Sort conveniently. + tb_with_average = tb_with_average.sort_values(["location", "date"]).reset_index(drop=True) + + for original_column_name in original_column_names: + # Check that the values of the original column have not been altered. + error = f"The values of the original {original_column_name} column have been altered." + assert tb_with_average[original_column_name].astype(float).equals(tb[original_column_name].astype(float)), error + + return tb_with_average + + +def prepare_gas_data(tb: Table) -> Table: + tb = tb.copy() + + # Extract gas name from table's short name. + gas = tb.metadata.short_name.split("_")[0] + + # Columns to select from the data, and how to rename them. + columns = { + "year": "year", + "month": "month", + "average": f"{gas}_concentration", + # The following column is loaded only to perform a sanity check. + "decimal": "decimal", + } + + # Select necessary columns and rename them. + tb = tb[list(columns)].rename(columns=columns, errors="raise") + + # There is a "decimal" column for the year as a decimal number, that only has 12 possible values, corresponding to + # the middle of each month, so we will assume the 15th of each month. + error = "Date format has changed." + assert len(set(tb["decimal"].astype(str).str.split(".").str[1])) == 12, error + assert set(tb["month"]) == set(range(1, 13)), error + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15)) + + # Remove unnecessary columns. + tb = tb.drop(columns=["year", "month", "decimal"], errors="raise") + + # Add a location column. + tb["location"] = "World" + + # Add a column with a rolling average for each gas. + tb = add_rolling_average(tb=tb, original_column_names=[f"{gas}_concentration"]) + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("ghg_concentration") + tb_co2 = ds_meadow["co2_concentration_monthly"].reset_index() + tb_ch4 = ds_meadow["ch4_concentration_monthly"].reset_index() + tb_n2o = ds_meadow["n2o_concentration_monthly"].reset_index() + + # + # Process data. + # + # Prepare data for each gas. + tb_co2 = prepare_gas_data(tb=tb_co2) + tb_ch4 = prepare_gas_data(tb=tb_ch4) + tb_n2o = prepare_gas_data(tb=tb_n2o) + + # Combine data for different gases. + tb = tb_co2.merge(tb_ch4, how="outer", on=["location", "date"]).merge( + tb_n2o, how="outer", on=["location", "date"], short_name=paths.short_name + ) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml new file mode 100644 index 00000000000..b02cba814ea --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.meta.yml @@ -0,0 +1,27 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + - CO2 & Greenhouse Gas Emissions + description_processing: |- + - Long-run data from ice core studies has been merged with recent measurements of atmospheric concentration of greenhouse gases. + +dataset: + update_period_days: 0 + +tables: + long_run_ghg_concentration: + variables: + co2_concentration: + title: Long-run CO₂ concentration + unit: parts per million volume + short_unit: ppmv + ch4_concentration: + title: Long-run CH₄ concentration + unit: parts per billion volume + short_unit: ppbv + n2o_concentration: + title: Long-run N₂O concentration + unit: parts per billion volume + short_unit: ppbv diff --git a/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py new file mode 100644 index 00000000000..0e07095b425 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/long_run_ghg_concentration.py @@ -0,0 +1,84 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table +from owid.datautils.dataframes import combine_two_overlapping_dataframes + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def convert_monthly_to_annual(tb_new: Table) -> Table: + tb_new = tb_new.copy() + + # Create a year column. + tb_new["year"] = tb_new["date"].dt.year + + # Create a table with the number of observations per year. + tb_counts = tb_new.groupby("year", as_index=False).agg( + { + "co2_concentration": "count", + "ch4_concentration": "count", + "n2o_concentration": "count", + } + ) + # Create a table with the average annual values. + tb_new = tb_new.groupby("year", as_index=False).agg( + { + "co2_concentration": "mean", + "ch4_concentration": "mean", + "n2o_concentration": "mean", + } + ) + # Make nan all data points based on less than 12 observations per year. + for gas in ["co2", "ch4", "n2o"]: + tb_new.loc[tb_counts[f"{gas}_concentration"] < 12, f"{gas}_concentration"] = None + + # Drop empty rows. + tb_new = tb_new.dropna( + subset=["co2_concentration", "ch4_concentration", "n2o_concentration"], how="all" + ).reset_index(drop=True) + + return tb_new + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset on long-run GHG concentrations from EPA, and read its main table. + ds_old = paths.load_dataset("ghg_concentration", namespace="epa") + tb_old = ds_old["ghg_concentration"].reset_index() + + # Load garden dataset of up-to-date GHG concentrations, and read its main table. + ds_new = paths.load_dataset("ghg_concentration", namespace="climate") + tb_new = ds_new["ghg_concentration"].reset_index() + + # + # Process data. + # + # Select columns. + tb_new = tb_new[["date", "co2_concentration", "ch4_concentration", "n2o_concentration"]].copy() + + # Calculate average annual values. + tb_new = convert_monthly_to_annual(tb_new=tb_new) + + # Combine old and new data, prioritizing the latter. 
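combine_two_overlapping_dataframes() (imported from owid.datautils above) is used just below to splice the recent annual series onto the long-run ice-core series, preferring the recent values wherever the years overlap. A rough plain-pandas analogue of that priority rule, with two hypothetical frames:

import pandas as pd

tb_new = pd.DataFrame({"year": [2000, 2001], "co2_concentration": [369.6, 371.1]})
tb_old = pd.DataFrame({"year": [1999, 2000], "co2_concentration": [368.0, 369.0]})

# combine_first keeps the caller's values and only fills gaps from the other frame,
# so the newer series wins on overlapping years.
combined = (
    tb_new.set_index("year")
    .combine_first(tb_old.set_index("year"))
    .sort_index()
    .reset_index()
)
print(combined)  # 1999 comes from tb_old; 2000 and 2001 come from tb_new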
+ tb = combine_two_overlapping_dataframes(df1=tb_new, df2=tb_old, index_columns=["year"]) + + # Rename table. + tb.metadata.short_name = paths.short_name + + # Add location column. + tb["location"] = "World" + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml new file mode 100644 index 00000000000..c7f6fb474ea --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.meta.yml @@ -0,0 +1,29 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + description_short: Measured in 10²² Joules. + unit: 10²² Joules + short_unit: 10²² J + +dataset: + title: Ocean Heat Content + update_period_days: 60 + +tables: + ocean_heat_content_monthly: + title: Ocean Heat Content - Monthly average + variables: + ocean_heat_content_700m: + title: Monthly average ocean heat content for the 0-700 meters layer + ocean_heat_content_2000m: + title: Monthly average ocean heat content for the 0-2000 meters layer + ocean_heat_content_annual: + title: Ocean Heat Content - Annual average + variables: + ocean_heat_content_700m: + title: Annual average ocean heat content for the 0-700 meters layer + ocean_heat_content_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py new file mode 100644 index 00000000000..dcbafe0d14c --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_heat_content.py @@ -0,0 +1,45 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("ocean_heat_content") + tb_monthly = ds_meadow["ocean_heat_content_monthly"].reset_index() + tb_annual = ds_meadow["ocean_heat_content_annual"].reset_index() + + # + # Process data. + # + # Improve the format of the date column in monthly date (assume the middle of the month for each data point). + tb_monthly["date"] = ( + tb_monthly["date"].str.split("-").str[0] + "-" + tb_monthly["date"].str.split("-").str[1].str.zfill(2) + "-15" + ) + + # Replace date column (where all years are given as, e.g. 1955.5, 2000.5) by year column in annual data. + tb_annual["year"] = tb_annual["date"].astype(int) + tb_annual = tb_annual.drop(columns=["date"], errors="raise") + + # Instead of having a column for depth, create columns of heat content for each depth. + tb_monthly["depth"] = tb_monthly["depth"].astype(str) + "m" + tb_monthly = tb_monthly.pivot(index=["location", "date"], columns="depth", join_column_levels_with="_") + tb_annual["depth"] = tb_annual["depth"].astype(str) + "m" + tb_annual = tb_annual.pivot(index=["location", "year"], columns="depth", join_column_levels_with="_") + + # Set an appropriate index to each table and sort conveniently. 
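The ocean heat content step above normalizes the monthly date strings to YYYY-MM-15 and then spreads the depth column into one heat-content column per depth. A small sketch of both operations on a hypothetical frame (the real step uses the owid Table.pivot with join_column_levels_with="_"; plain pandas needs an explicit column flattening):

import pandas as pd

tb = pd.DataFrame(
    {
        "location": ["World", "World"],
        "date": ["1955-3", "1955-3"],
        "depth": [700, 2000],
        "ocean_heat_content": [1.2, 2.3],
    }
)

# Zero-pad the month and assume the middle of the month for each data point.
parts = tb["date"].str.split("-")
tb["date"] = parts.str[0] + "-" + parts.str[1].str.zfill(2) + "-15"

# One heat-content column per depth, e.g. ocean_heat_content_700m.
tb["depth"] = tb["depth"].astype(str) + "m"
wide = tb.pivot(index=["location", "date"], columns="depth", values="ocean_heat_content")
wide.columns = [f"ocean_heat_content_{c}" for c in wide.columns]
print(wide.reset_index())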
+ tb_monthly = tb_monthly.set_index(["location", "date"], verify_integrity=True).sort_index() + tb_annual = tb_annual.set_index(["location", "year"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb_annual, tb_monthly], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml new file mode 100644 index 00000000000..d9364bd3280 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.meta.yml @@ -0,0 +1,22 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + unit: pH + short_unit: pH + +dataset: + title: Ocean pH Levels + update_period_days: 60 + +tables: + ocean_ph_levels: + title: Ocean pH levels + variables: + ocean_ph: + title: Monthly measurement of ocean pH levels + processing_level: minor + ocean_ph_yearly_average: + title: Rolling yearly average of ocean pH levels + processing_level: major diff --git a/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py new file mode 100644 index 00000000000..db98a40272e --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/ocean_ph_levels.py @@ -0,0 +1,86 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. +COLUMNS = { + "date": "date", + "phcalc_insitu": "ocean_ph", +} + + +def add_rolling_average(tb: Table) -> Table: + tb_with_average = tb.copy() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index("date").sort_index() + + # Since values are given at different days of the month, reindex to have a value for each day. + tb_with_average = tb_with_average.reindex( + pd.date_range(start=tb_with_average.index.min(), end=tb_with_average.index.max(), freq="1D") + ) + + # Create a rolling average with a window of one year, linearly interpolating missing values. + tb_with_average["ocean_ph_yearly_average"] = ( + tb_with_average["ocean_ph"] + .interpolate(method="time") + .rolling(365) + .mean() + .copy_metadata(tb_with_average["ocean_ph"]) + ) + + # Drop empty rows. + tb_with_average = ( + tb_with_average.dropna(subset=["ocean_ph"]).reset_index().rename(columns={"index": "date"}, errors="raise") + ) + + # Check that the values of the original ocean ph column have not been altered. + error = "The values of the original ocean_ph column have been altered." + assert tb_with_average["ocean_ph"].equals( + tb.dropna(subset=["ocean_ph"]).sort_values("date").reset_index(drop=True)["ocean_ph"] + ), error + + return tb_with_average + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("hawaii_ocean_time_series") + tb_meadow = ds_meadow["hawaii_ocean_time_series"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb = tb_meadow[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Add location column. + tb["location"] = "Hawaii" + + # Improve format of date column. + tb["date"] = pd.to_datetime(tb["date"], format="%d-%b-%y") + + # Add a column with a rolling average. 
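add_rolling_average() above smooths the irregular pH measurements by reindexing to a daily grid, interpolating in time, and taking a 365-day rolling mean, then keeping only the dates that had an actual measurement. A compact sketch of that recipe on hypothetical data:

import pandas as pd

tb = pd.DataFrame(
    {
        "date": pd.to_datetime(["1989-01-10", "1989-02-03", "1989-03-07"]),
        "ocean_ph": [8.11, 8.10, 8.12],
    }
)

daily = tb.set_index("date").sort_index()
# Fill the gaps between irregular measurement days with a daily grid, interpolate in time,
# then average over a 365-day window (NaN for this tiny sample; real data spans decades).
daily = daily.reindex(pd.date_range(daily.index.min(), daily.index.max(), freq="1D"))
daily["ocean_ph_yearly_average"] = daily["ocean_ph"].interpolate(method="time").rolling(365).mean()

# Keep only the dates that had an original measurement.
result = daily.dropna(subset=["ocean_ph"]).reset_index().rename(columns={"index": "date"})
print(result)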
+ tb = add_rolling_average(tb=tb) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # Rename table. + tb.metadata.short_name = paths.short_name + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml new file mode 100644 index 00000000000..7facebf9240 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.meta.yml @@ -0,0 +1,19 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: Sea Ice Index + update_period_days: 60 + +tables: + sea_ice_index: + variables: + sea_ice_extent: + title: Sea ice extent + # description_short: TODO + unit: million square kilometers + short_unit: million km² + processing_level: minor diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py new file mode 100644 index 00000000000..3f8247e42b5 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_ice_index.py @@ -0,0 +1,44 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("sea_ice_index") + + # Read table from meadow dataset. + tb = ds_meadow["sea_ice_index"].reset_index() + + # + # Process data. + # + # Remove column with annual average. + tb = tb.drop(columns=["annual"]) + + # Convert table to long format. + tb = tb.melt(id_vars=["location", "year"], var_name="month", value_name="sea_ice_extent") + + # Create column of date, assuming each measurement is taken mid month. + tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"].str[0:3] + "15", format="%Y%b%d") + + # Drop empty rows and unnecessary columns. + tb = tb.dropna().drop(columns=["year", "month"]) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml new file mode 100644 index 00000000000..bf9ee9d13dc --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.meta.yml @@ -0,0 +1,29 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + +dataset: + title: Sea surface temperature + update_period_days: 60 + +tables: + sea_surface_temperature: + variables: + sea_temperature_anomaly: + title: "Monthly sea surface temperature anomaly" + description_short: Measured in degrees Celsius. + unit: °C + short_unit: °C + sea_temperature_anomaly_low: + title: "Monthly sea surface temperature anomaly (lower bound)" + description_short: Measured in degrees Celsius. 
+ unit: °C + short_unit: °C + sea_temperature_anomaly_high: + title: "Monthly sea surface temperature anomaly (upper bound)" + description_short: Measured in degrees Celsius. + unit: °C + short_unit: °C diff --git a/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py new file mode 100644 index 00000000000..2c2fb56098e --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/sea_surface_temperature.py @@ -0,0 +1,48 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +# Columns to select from data, and how to rename them. +COLUMNS = { + "year": "year", + "month": "month", + "location": "location", + "anomaly": "sea_temperature_anomaly", + "lower_bound_95pct_bias_uncertainty_range": "sea_temperature_anomaly_low", + "upper_bound_95pct_bias_uncertainty_range": "sea_temperature_anomaly_high", +} + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("sea_surface_temperature") + tb = ds_meadow["sea_surface_temperature"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise") + + # Create a date column (assume the middle of the month for each monthly data point). + tb["date"] = tb["year"].astype(str) + "-" + tb["month"].astype(str).str.zfill(2) + "-15" + + # Remove unnecessary columns. + tb = tb.drop(columns=["year", "month"], errors="raise") + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml new file mode 100644 index 00000000000..698ad73c63f --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.meta.yml @@ -0,0 +1,23 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + unit: "square kilometers" + short_unit: "km²" + description_short: Measured in square kilometers. + +dataset: + title: Snow Cover Extent + update_period_days: 60 + +tables: + snow_cover_extent: + title: Snow Cover Extent + variables: + snow_cover_extent: + title: Monthly measurement of the area covered by snow + processing_level: minor + snow_cover_extent_yearly_average: + title: Rolling yearly average of the area covered by snow + processing_level: major diff --git a/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py new file mode 100644 index 00000000000..f5b5d039b34 --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/snow_cover_extent.py @@ -0,0 +1,97 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to select from the data, and how to rename them. 
+COLUMNS = { + "date": "date", + "phcalc_insitu": "ocean_ph", +} + + +def add_rolling_average(tb: Table, original_column_name: str) -> Table: + tb_with_average = tb.copy() + + # Create a date range. + date_range = pd.date_range(start=tb_with_average["date"].min(), end=tb_with_average["date"].max(), freq="1D") + + # Get unique locations. + unique_locations = tb_with_average["location"].unique() + + # Set date as index and sort. + tb_with_average = tb_with_average.set_index(["location", "date"]).sort_index() + + # Create a MultiIndex with all possible combinations of date and location. + multi_index = pd.MultiIndex.from_product([unique_locations, date_range], names=["location", "date"]) + + # Reindex using the MultiIndex. + tb_with_average = tb_with_average.reindex(multi_index) + + # Create a rolling average with a window of one year, linearly interpolating missing values. + tb_with_average[f"{original_column_name}_yearly_average"] = ( + tb_with_average[original_column_name] + .interpolate(method="linear") + .rolling(365) + .mean() + .copy_metadata(tb_with_average[original_column_name]) + ) + + # Drop empty rows. + tb_with_average = tb_with_average.dropna(subset=[original_column_name]).reset_index() + + # Remove rolling average for the first year, given that it is based on incomplete data. + tb_with_average.loc[ + tb_with_average["date"] < tb_with_average["date"].min() + pd.Timedelta(days=365), + f"{original_column_name}_yearly_average", + ] = None + + # Sort conveniently. + tb_with_average = tb_with_average.sort_values(["location", "date"]).reset_index(drop=True) + + # Check that the values of the original column have not been altered. + error = f"The values of the original {original_column_name} column have been altered." + assert tb_with_average[original_column_name].astype(int).equals(tb[original_column_name].astype(int)), error + + return tb_with_average + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("snow_cover_extent") + tb = ds_meadow["snow_cover_extent"].reset_index() + + # + # Process data. + # + # Create a date column. + # NOTE: Assign the middle of the month. + tb["date"] = pd.to_datetime(tb[["year", "month"]].assign(day=15)) + tb = tb.drop(columns=["year", "month"], errors="raise") + + # Data starts in 1966, but, as mentioned on their website + # https://climate.rutgers.edu/snowcover/table_area.php?ui_set=1&ui_sort=0 + # there is missing data between 1968 and 1971. + # So, for simplicity, select data from 1972 onwards, where data is complete. + tb = tb[tb["date"] >= "1972-01-01"].reset_index(drop=True) + + # Add a column with a rolling average. + tb = add_rolling_average(tb=tb, original_column_name="snow_cover_extent") + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml new file mode 100644 index 00000000000..eda07f5ae5a --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.meta.yml @@ -0,0 +1,20 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + +dataset: + title: GISS surface temperature analysis + update_period_days: 60 + +tables: + surface_temperature_analysis: + variables: + temperature_anomaly: + title: "Global warming: monthly temperature anomaly" + description_short: |- + Combined land-surface air and sea-surface water temperature anomaly, given as the deviation from the 1951-1980 mean, in degrees Celsius. + unit: °C + short_unit: °C + processing_level: minor diff --git a/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py new file mode 100644 index 00000000000..43d328abbde --- /dev/null +++ b/etl/steps/data/garden/climate/2024-04-17/surface_temperature_analysis.py @@ -0,0 +1,56 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("surface_temperature_analysis") + tb = ds_meadow["surface_temperature_analysis_world"] + + # + # Process data. + # + # Initialize dictionary to store processed tables. + tables = {} + for table_name in ds_meadow.table_names: + # Read table. + tb = ds_meadow[table_name].reset_index() + # Get location from table name. + location = table_name.split("surface_temperature_analysis_")[-1].replace("_", " ").title() + # Add column for location. + tb["location"] = location + # Convert table to long format. + tb = tb.melt(id_vars=["year", "location"], var_name="month", value_name="temperature_anomaly") + # Create column of date, assuming each measurement is taken mid month. + tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"] + "15", format="%Y%b%d") + # Copy metadata from any other previous column. + tb["date"] = tb["date"].copy_metadata(tb["location"]) + # Select necessary columns. + tb = tb[["location", "date", "temperature_anomaly"]] + # Remove rows with missing values. + tb = tb.dropna(subset=["temperature_anomaly"]).reset_index(drop=True) + # Update table. + tables[location] = tb + + # Concatenate all tables. + tb = pr.concat(list(tables.values()), ignore_index=True, short_name=paths.short_name) + + # Set an appropriate index and sort conveniently. + tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index() + + # + # Save outputs. + # + # Create a new garden dataset with the combined table. 
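The surface temperature step above reshapes one column per month into a long table and builds a proper date, assuming mid-month measurements. A minimal sketch of the melt plus "%Y%b%d" parsing on a hypothetical wide table (the month column names are assumptions here):

import pandas as pd

tb = pd.DataFrame({"year": [1990], "location": ["World"], "Jan": [0.35], "Feb": [0.29]})

# Wide (one column per month) -> long (one row per year-month).
tb = tb.melt(id_vars=["year", "location"], var_name="month", value_name="temperature_anomaly")

# "1990" + "Jan" + "15" -> 1990-01-15 (mid-month convention).
tb["date"] = pd.to_datetime(tb["year"].astype(str) + tb["month"] + "15", format="%Y%b%d")

print(tb[["location", "date", "temperature_anomaly"]])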
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/democracy/2024-03-07/bmr.py b/etl/steps/data/garden/democracy/2024-03-07/bmr.py index 578cd9c96f8..74aa650c352 100644 --- a/etl/steps/data/garden/democracy/2024-03-07/bmr.py +++ b/etl/steps/data/garden/democracy/2024-03-07/bmr.py @@ -274,7 +274,7 @@ def add_imputes(tb: Table) -> Table: tb = concat(tb_imputed + [tb], ignore_index=True) # Set to False by default (for non-imputed countries) - tb["regime_imputed"] = tb["regime_imputed"].fillna(False) + tb["regime_imputed"] = tb["regime_imputed"].fillna(False).astype(bool) # Re-order columns cols = [ diff --git a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py index 01dd3c7918a..4c73aaa3796 100644 --- a/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py +++ b/etl/steps/data/garden/education/2023-07-17/education_lee_lee.py @@ -74,13 +74,17 @@ def run(dest_dir: str) -> None: tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) # Replace age group values with descriptive labels - tb["age_group"] = tb["age_group"].replace( - { - "15.0-64.0": "Youth and Adults (15-64 years)", - "15.0-24.0": "Youth (15-24 years)", - "25.0-64.0": "Adults (25-64 years)", - "not specified": "Age not specified", - } + tb["age_group"] = ( + tb["age_group"] + .astype(str) + .replace( + { + "15.0-64.0": "Youth and Adults (15-64 years)", + "15.0-24.0": "Youth (15-24 years)", + "25.0-64.0": "Adults (25-64 years)", + "not specified": "Age not specified", + } + ) ) # Prepare enrollment and attainment data diff --git a/etl/steps/data/garden/education/2023-07-17/shared.py b/etl/steps/data/garden/education/2023-07-17/shared.py index 8db6ff57962..ce0998d945d 100644 --- a/etl/steps/data/garden/education/2023-07-17/shared.py +++ b/etl/steps/data/garden/education/2023-07-17/shared.py @@ -135,7 +135,11 @@ def add_region_aggregates_education( def weighted_mean(x, w): values = np.ma.masked_invalid(x.astype("float64")) weights = np.ma.masked_invalid(w.astype("float64")) - return np.ma.average(values, weights=weights) + out = np.ma.average(values, weights=weights) + if np.ma.is_masked(out): + return np.nan + else: + return out # Create a closure to define variable_agg with specific weights def make_weighted_mean(weights): @@ -149,7 +153,7 @@ def variable_agg(x): else: variable_agg = aggregations[variable] - aggs[variable] = variable_agg + aggs[variable] = variable_agg # type: ignore df_region = groupby_agg( df=df_countries, diff --git a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py index 98fd7cb5b14..9f9142fdd66 100644 --- a/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py +++ b/etl/steps/data/garden/emissions/2024-02-26/gdp_and_co2_decoupling.py @@ -9,6 +9,7 @@ TODO: Include link to the updated static chart once it is created. 
""" + from structlog import get_logger from etl.helpers import PathFinder, create_dataset diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json new file mode 100644 index 00000000000..5b3ccbfe1df --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.countries.json @@ -0,0 +1,227 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Christmas Island": "Christmas Island", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "EU27": "European Union (27)", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Ethiopia": "Ethiopia", + "Faeroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Polynesia": "French Polynesia", + "GLOBAL": "World", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kyrgyzstan": "Kyrgyzstan", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + 
"Luxembourg": "Luxembourg", + "Macao": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Occupied Palestinian Territory": "Palestine", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USA": "United States", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Viet Nam": "Vietnam", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires", + "Leeward Islands": "Leeward Islands", + "Panama Canal Zone": "Panama Canal Zone", + "Ryukyu Islands": "Ryukyu Islands", + "St. Kitts-Nevis-Anguilla": "St. 
Kitts-Nevis-Anguilla", + "LDC": "Least developed countries (Jones et al.)", + "OECD": "OECD (Jones et al.)" +} diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json new file mode 100644 index 00000000000..f4e1bbdf837 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.excluded_countries.json @@ -0,0 +1,9 @@ +[ + "ANNEXI", + "ANNEXII", + "BASIC", + "EIT", + "LMDC", + "NONANNEX", + "Pacific Islands (Palau)" +] diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml new file mode 100644 index 00000000000..8d6fd94bf5e --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.meta.yml @@ -0,0 +1,428 @@ +definitions: + measured_in_celsius: &measured-in-celsius |- + Measured in °C. + measured_in_tonnes: &measured-in-tonnes |- + Measured in tonnes. + measured_in_tonnes_per_person: &measured-in-tonnes-per-person |- + Measured in tonnes per person. + measured_in_co2_eq: &measured-in-co2-eq |- + Measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + measured_in_co2_eq_per_person: &measured-in-co2-eq-per-person |- + Measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions: &ghg-emissions |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + ghg_emissions_per_person: &ghg-emissions-per-person |- + [Greenhouse gas emissions](#dod:ghgemissions) are measured in tonnes per person of [carbon dioxide-equivalents](#dod:carbondioxideequivalents) over a 100-year timescale. + processing_methane: &processing-methane |- + Methane emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 29.8 for fossil sources and 27.2 for agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_nitrous_oxide: &processing-nitrous-oxide |- + Nitrous oxide emissions in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273. This factor is taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + processing_greenhouse_gases: &processing-greenhouse-gases |- + Emissions given in tonnes have been converted to carbon-dioxide equivalents over a 100-year timescale using a conversion factor of 273 for nitrous oxide, 29.8 for methane from fossil sources, and 27.2 for methane from agricultural and land use sources. These factors are taken from the 6th Assessment Report (AR6) of the Intergovernmental Panel on Climate Change (IPCC). + common: + processing_level: major + presentation: + topic_tags: + - CO2 & Greenhouse Gas Emissions + +dataset: + update_period_days: 365 + description: |- + Jones et al. quantify national and regional contributions to the increase of global mean surface temperature over the last few centuries. + +tables: + national_contributions: + variables: + # Emissions of CH4, CO2, N2O in tonnes (as originally given in the data). 
+ annual_emissions_ch4_fossil: + title: Annual methane emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land: + title: Annual methane emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total: + title: Annual methane emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual methane emissions + annual_emissions_co2_fossil: + title: Annual CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from fossil fuels and industry + annual_emissions_co2_land: + title: Annual CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions from agriculture and land use + annual_emissions_co2_total: + title: Annual CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual CO₂ emissions + annual_emissions_n2o_fossil: + title: Annual nitrous oxide emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land: + title: Annual nitrous oxide emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total: + title: Annual nitrous oxide emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Annual nitrous oxide emissions + # Emissions (calculated by OWID) of CH4, CO2, N2O in tonnes of CO2eq, as well as combined GHG emissions in CO2eq. 
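The *_co2eq indicators defined below apply the GWP100 factors spelled out in the processing notes above (29.8 for fossil methane, 27.2 for agricultural and land-use methane, 273 for nitrous oxide). A worked example of that conversion with made-up emission figures:

# Hypothetical national emissions for one year, in tonnes of each gas.
ch4_fossil = 1_000_000
ch4_land = 2_000_000
n2o_total = 100_000

# GWP100 factors from IPCC AR6, as listed in the dataset definitions above.
ch4_fossil_co2eq = ch4_fossil * 29.8  # 29.8 million tonnes CO2-eq
ch4_land_co2eq = ch4_land * 27.2      # 54.4 million tonnes CO2-eq
n2o_total_co2eq = n2o_total * 273     # 27.3 million tonnes CO2-eq

print(ch4_fossil_co2eq + ch4_land_co2eq + n2o_total_co2eq)  # 111,500,000 tonnes CO2-eq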
+ annual_emissions_ghg_fossil_co2eq: + title: Annual greenhouse gas emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from fossil fuels and industry + annual_emissions_ghg_land_co2eq: + title: Annual greenhouse gas emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions from agriculture and land use + annual_emissions_ghg_total_co2eq: + title: Annual greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + description_processing: *processing-greenhouse-gases + presentation: + title_public: Annual greenhouse gas emissions + annual_emissions_ch4_fossil_co2eq: + title: Annual methane emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from fossil fuels and industry + annual_emissions_ch4_land_co2eq: + title: Annual methane emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions from agriculture and land use + annual_emissions_ch4_total_co2eq: + title: Annual methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-methane + presentation: + title_public: Annual methane emissions + annual_emissions_n2o_fossil_co2eq: + title: Annual nitrous oxide emissions from fossil fuels and industry in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from fossil fuels and industry + annual_emissions_n2o_land_co2eq: + title: Annual nitrous oxide emissions from agriculture and land use in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions from agriculture and land use + annual_emissions_n2o_total_co2eq: + title: Annual nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + description_processing: *processing-nitrous-oxide + presentation: + title_public: Annual nitrous oxide emissions + # Cumulative emissions of CH4, CO2, N2O and GHG, in tonnes of CO2eq (as originally given in the data). 
+ cumulative_emissions_ghg_fossil: + title: Cumulative greenhouse gas emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from fossil fuels and industry + cumulative_emissions_ghg_land: + title: Cumulative greenhouse gas emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions from agriculture and land use + cumulative_emissions_ghg_total: + title: Cumulative greenhouse gas emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions + presentation: + title_public: Cumulative greenhouse gas emissions + cumulative_emissions_ch4_fossil: + title: Cumulative methane emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from fossil fuels and industry + cumulative_emissions_ch4_land: + title: Cumulative methane emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions from agriculture and land use + cumulative_emissions_ch4_total: + title: Cumulative methane emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative methane emissions + cumulative_emissions_co2_fossil: + title: Cumulative CO₂ emissions from fossil fuels and industry + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from fossil fuels and industry + cumulative_emissions_co2_land: + title: Cumulative CO₂ emissions from agriculture and land use + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions from agriculture and land use + cumulative_emissions_co2_total: + title: Cumulative CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes + presentation: + title_public: Cumulative CO₂ emissions + cumulative_emissions_n2o_fossil: + title: Cumulative nitrous oxide emissions from fossil fuels and industry + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from fossil fuels and industry + cumulative_emissions_n2o_land: + title: Cumulative nitrous oxide emissions from agriculture and land use + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions from agriculture and land use + cumulative_emissions_n2o_total: + title: Cumulative nitrous oxide emissions + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq + presentation: + title_public: Cumulative nitrous oxide emissions + # Temperature response to emissions of CH4, CO2, N2O and GHG, in °C (as originally given in the data). 
+ temperature_response_ghg_fossil: + title: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from fossil fuels and industry + temperature_response_ghg_land: + title: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions from agriculture and land use + temperature_response_ghg_total: + title: Change in global mean surface temperature caused by greenhouse gas emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by greenhouse gas emissions + temperature_response_ch4_fossil: + title: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from fossil fuels and industry + temperature_response_ch4_land: + title: Change in global mean surface temperature caused by methane emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by methane emissions from agriculture and land use + temperature_response_ch4_total: + title: Change in global mean surface temperature caused by methane emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of methane. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. 
+ presentation: + title_public: Change in global mean surface temperature caused by methane emissions + temperature_response_co2_fossil: + title: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from fossil fuels and industry + temperature_response_co2_land: + title: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions from agriculture and land use + temperature_response_co2_total: + title: Change in global mean surface temperature caused by CO₂ emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by CO₂ emissions + temperature_response_n2o_fossil: + title: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from fossil fuels and industry + temperature_response_n2o_land: + title: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + unit: °C + short_unit: °C + description_short: *measured-in-celsius + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions from agriculture and land use + temperature_response_n2o_total: + title: Change in global mean surface temperature caused by nitrous oxide emissions + unit: °C + short_unit: °C + description_short: *measured-in-celsius + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Change in global mean surface temperature caused by nitrous oxide emissions + # Share of emissions (calculated by OWID), e.g. methane emissions as a percentage of global methane emissions. + # NOTE: Using CO2eq or tonnes of the original gas is irrelevant when calculated as a share of global emissions. + share_of_annual_emissions_ghg_total: + title: Share of global greenhouse gas emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's greenhouse gas emissions." + description_processing: *processing-greenhouse-gases + presentation: + title_public: Share of global greenhouse gas emissions + share_of_annual_emissions_ch4_total: + title: Share of global methane emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's methane emissions." 
+ presentation: + title_public: Share of global methane emissions + share_of_annual_emissions_co2_total: + title: Share of global CO₂ emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's carbon dioxide emissions." + presentation: + title_public: Share of global CO₂ emissions + share_of_annual_emissions_n2o_total: + title: Share of global nitrous oxide emissions + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's nitrous oxide emissions." + presentation: + title_public: Share of global nitrous oxide emissions + # Share of global temperature change caused by greenhouse gas emissions from each country (calculated by OWID). + share_of_temperature_response_ghg_total: + title: Share of contribution to global warming + unit: "%" + short_unit: "%" + description_short: "Measured as a percentage of the world's temperature change." + description_key: + - This temperature change measures each country's contribution to global mean surface temperature (GMST) rise from its cumulative emissions of carbon dioxide, methane and nitrous oxide. + - The warming effects of each gas are calculated based on cumulative CO₂-equivalent emissions using the Global Warming Potential (GWP*) approach. + presentation: + title_public: Share of contribution to global warming + # Per capita emissions (calculated by OWID). + annual_emissions_co2_total_per_capita: + title: Per-capita CO₂ emissions + unit: tonnes + short_unit: t + description_short: *measured-in-tonnes-per-person + presentation: + title_public: Per-capita CO₂ emissions + annual_emissions_ch4_total_co2eq_per_capita: + title: Per-capita methane emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-methane + presentation: + title_public: Per-capita methane emissions + annual_emissions_n2o_total_co2eq_per_capita: + title: Per-capita nitrous oxide emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *measured-in-co2-eq-per-person + description_processing: *processing-nitrous-oxide + presentation: + title_public: Per-capita nitrous oxide emissions + annual_emissions_ghg_total_co2eq_per_capita: + title: Per-capita greenhouse gas emissions in CO₂ equivalents + unit: tonnes of CO₂ equivalents + short_unit: t + description_short: *ghg-emissions-per-person + description_processing: *processing-greenhouse-gases + presentation: + title_public: Per-capita greenhouse gas emissions diff --git a/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py new file mode 100644 index 00000000000..6ac00bafe70 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py @@ -0,0 +1,354 @@ +"""Load a meadow dataset and create a garden dataset.""" + + +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table, Variable +from owid.datautils.dataframes import map_series + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor to change from teragrams to tonnes. +TERAGRAMS_TO_TONNES = 1e6 +# Conversion factor to change from petagrams to tonnes. 
+PETAGRAMS_TO_TONNES = 1e9 + +# Conversion factors to change from tonnes of gases emitted to tonnes of CO2 equivalents (taken from IPCC AR6). +CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS = 29.8 +CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS = 27.2 +N2O_EMISSIONS_TO_CO2_EQUIVALENTS = 273 + +# Gases and components expected to be in the data, and how to rename them. +GASES_RENAMING = { + "3-GHG": "ghg", + "CH[4]": "ch4", + "CO[2]": "co2", + "N[2]*O": "n2o", +} +COMPONENTS_RENAMING = { + "Fossil": "fossil", + "LULUCF": "land", + "Total": "total", +} + +# Columns for which we will create "share" variables, e.g. the percentage of methane emissions that a country produces +# in a year with respect to the world's methane emissions on the same year. +# NOTE: For this calculation, it doesn't matter if we use the total or the CO2-equivalent emissions. +SHARE_VARIABLES = [ + "annual_emissions_ch4_total", + "annual_emissions_co2_total", + "annual_emissions_n2o_total", + "annual_emissions_ghg_total_co2eq", + "temperature_response_ghg_total", +] + +# Columns for which a per-capita variable will be created. +PER_CAPITA_VARIABLES = [ + "annual_emissions_ch4_total_co2eq", + "annual_emissions_co2_total", + "annual_emissions_n2o_total_co2eq", + "annual_emissions_ghg_total_co2eq", +] + +# Regions to be added by aggregating data from their member countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, + # EU27 is already included in the original data. + # "European Union (27)": {}, +} + + +def run_sanity_checks_on_inputs(tb): + # Sanity checks. + error = "Names of gases have changed." + assert set(tb["gas"]) == set(GASES_RENAMING), error + error = "Names of components have changed." + assert set(tb["component"]) == set(COMPONENTS_RENAMING), error + error = "Units have changed." + assert set(tb["unit"]) == set( + ["Tg~CH[4]~year^-1", "Pg~CO[2]~year^-1", "Tg~N[2]*O~year^-1", "Pg~CO[2]*-e[100]", "°C"] + ), error + + +def add_kuwaiti_oil_fires_to_kuwait(tb: Table) -> Table: + tb = tb.copy() + + # NOTE: Use this function before harmonizing country names. Otherwise adapt the following definitions. + kuwait = "Kuwait" + oil_fires = "Kuwaiti Oil Fires" + + # Sanity check. + error = f"'{kuwait}' or '{oil_fires}' not found in the data." + assert kuwait in set(tb["country"]), error + assert oil_fires in set(tb["country"]), error + + # Add the emissions from the Kuwaiti oil fires (in 1991) to Kuwait. 
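+    # The two series are aligned on their year index and summed; the oil-fires series is filled with zeros first,
+    # so that years with no reported oil-fire emissions do not turn Kuwait's values into NaN.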
+ tb_kuwait = tb[tb["country"] == kuwait].drop(columns="country").set_index("year") + tb_oil_fires = tb[tb["country"] == oil_fires].drop(columns="country").fillna(0).set_index(["year"]) + tb_combined = (tb_kuwait + tb_oil_fires).reset_index().assign(**{"country": kuwait}) + + # Replace the original data for Kuwait by the combined data. + tb_updated = pr.concat([tb[tb["country"] != kuwait].reset_index(drop=True), tb_combined], ignore_index=True) + + # Sort conveniently. + tb_updated = tb_updated.sort_values(["country", "year"]).reset_index(drop=True) + + return tb_updated + + +def add_emissions_in_co2_equivalents(tb: Table) -> Table: + # Add columns for fossil/land/total emissions of CH4 in terms of CO2 equivalents. + # NOTE: For methane, we apply different conversion factors for fossil and land-use emissions. + tb["annual_emissions_ch4_fossil_co2eq"] = ( + tb["annual_emissions_ch4_fossil"] * CH4_FOSSIL_EMISSIONS_TO_CO2_EQUIVALENTS + ) + tb["annual_emissions_ch4_land_co2eq"] = tb["annual_emissions_ch4_land"] * CH4_LAND_EMISSIONS_TO_CO2_EQUIVALENTS + tb["annual_emissions_ch4_total_co2eq"] = ( + tb["annual_emissions_ch4_fossil_co2eq"] + tb["annual_emissions_ch4_land_co2eq"] + ) + + # Add columns for fossil/land/total emissions of N2O in terms of CO2 equivalents. + # NOTE: For nitrous oxide, we apply the same conversion factors for fossil and land-use emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_n2o_{component}_co2eq"] = ( + tb[f"annual_emissions_n2o_{component}"] * N2O_EMISSIONS_TO_CO2_EQUIVALENTS + ) + + # Add columns for fossil/land/total emissions of all GHG in terms of CO2 equivalents. + # NOTE: The file of annual emissions does not include GHG emissions, which is why we need to add them now. + # However, the files of temperature response and cumulative emissions do include GHG emissions. + for component in ["fossil", "land", "total"]: + tb[f"annual_emissions_ghg_{component}_co2eq"] = ( + tb[f"annual_emissions_co2_{component}"] + + tb[f"annual_emissions_ch4_{component}_co2eq"] + + tb[f"annual_emissions_n2o_{component}_co2eq"] + ) + + return tb + + +def add_share_variables(tb: Table) -> Table: + tb = tb.copy() + + # Create "share" variables (percentages with respect to global). + # To do that, first create a separate table for global data, and add it to the main table. + tb_global = tb[tb["country"] == "World"][["year"] + SHARE_VARIABLES].reset_index(drop=True) + + tb = tb.merge(tb_global, on=["year"], how="left", suffixes=("", "_global")) + # For a list of variables, add the percentage with respect to global. + for variable in SHARE_VARIABLES: + new_variable = f"share_of_{variable.replace('_co2eq', '')}" + tb[new_variable] = 100 * tb[variable] / tb[f"{variable}_global"] + + # Drop unnecessary columns for global data. + tb = tb.drop(columns=[column for column in tb.columns if column.endswith("_global")], errors="raise") + + return tb + + +def add_per_capita_variables(tb: Table, ds_population: Dataset) -> Table: + tb = tb.copy() + + # Add population to data. + tb = geo.add_population_to_table( + tb=tb, + ds_population=ds_population, + warn_on_missing_countries=False, + ) + + # Add per-capita variables. + for variable in PER_CAPITA_VARIABLES: + tb[f"{variable}_per_capita"] = tb[variable] / tb["population"] + + # Drop population column. + tb = tb.drop(columns="population", errors="raise") + + return tb + + +def fix_emissions_jump_in_1850(tb: Table) -> Table: + # There is data from 1830 for some variables and from 1850 for others. 
+ # However, when inspecting data between 1830 and 1850 (e.g. annual_emissions_co2_total) there is an abrupt jump + # between 1849 and 1850, which happens for many countries (e.g. Spain, or World). + # This jump seems to be spurious, and therefore we start all time series from 1850. + + # First check that the jump is still in the data. + emissions_before_jump = tb[(tb["country"] == "World") & (tb["year"] == 1849)]["annual_emissions_co2_total"].item() + emissions_after_jump = tb[(tb["country"] == "World") & (tb["year"] == 1850)]["annual_emissions_co2_total"].item() + error = "Spurious jump between 1849 and 1850 is not in the data anymore. Remove this part of the code." + assert emissions_after_jump / emissions_before_jump > 10, error + + # Visually inspect the jump. + # import plotly.express as px + # px.line(tb[tb["country"]=="World"], x="year", y="annual_emissions_co2_total", markers=True) + + # Start all data after the jump. + tb = tb[tb["year"] >= 1850].reset_index(drop=True) + + return tb + + +def run_sanity_checks_on_outputs(tb: Table) -> None: + error = "Share of global emissions cannot be larger than 101%" + assert (tb[[column for column in tb.columns if "share" in column]].max() < 101).all(), error + error = "Share of global emissions was not expected to be smaller than -1%" + # Some countries did contribute negatively to CO2 emissions, however overall the negative contribution is always + # smaller than 1% in absolute value. + assert (tb[[column for column in tb.columns if "share" in column]].min() > -1).all(), error + + # Ensure that no country contributes to emissions more than the entire world. + columns_that_should_be_smaller_than_global = [ + column for column in tb.drop(columns=["country", "year"]).columns if "capita" not in column + ] + tb_global = tb[tb["country"] == "World"].drop(columns="country") + check = pr.merge( + tb[tb["country"] != "World"].reset_index(drop=True), tb_global, on="year", how="left", suffixes=("", "_global") + ) + for column in columns_that_should_be_smaller_than_global: + # It is in principle possible that some region would emit more than the world, if the rest of regions + # were contributing with negative CO2 emissions (e.g. High-income countries in 1854). + # However, the difference should be very small. + error = f"Region contributed to {column} more than the entire world." + assert check[(check[column] - check[f"{column}_global"]) / check[f"{column}_global"] > 0.00001].empty, error + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("national_contributions") + tb = ds_meadow["national_contributions"].reset_index() + + # Load regions dataset. + ds_regions = paths.load_dataset("regions") + + # Load income groups dataset. + ds_income_groups = paths.load_dataset("income_groups") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # + # Process data. + # + # Sanity checks. + run_sanity_checks_on_inputs(tb=tb) + + # Rename gases and components. + tb["gas"] = Variable( + map_series( + series=tb["gas"], mapping=GASES_RENAMING, warn_on_missing_mappings=True, warn_on_unused_mappings=True + ) + ).copy_metadata(tb["gas"]) + tb["component"] = Variable( + map_series( + series=tb["component"], + mapping=COMPONENTS_RENAMING, + warn_on_missing_mappings=True, + warn_on_unused_mappings=True, + ) + ).copy_metadata(tb["component"]) + + # Convert units from teragrams and petagrams to tonnes. 
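+    # (1 Tg = 1e6 tonnes and 1 Pg = 1e9 tonnes, following the conversion constants defined at the top of this step.)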
+    tb.loc[tb["unit"].str.startswith("Tg"), "data"] *= TERAGRAMS_TO_TONNES
+    tb.loc[tb["unit"].str.startswith("Pg"), "data"] *= PETAGRAMS_TO_TONNES
+
+    # Transpose data.
+    tb = tb.pivot(
+        index=["country", "year"], columns=["file", "gas", "component"], values="data", join_column_levels_with="_"
+    )
+
+    # We add the emissions from the Kuwaiti oil fires in 1991 (which are also included as a separate country) as part
+    # of the emissions of Kuwait.
+    # This ensures that these emissions will be included in aggregates of regions that include Kuwait.
+    tb = add_kuwaiti_oil_fires_to_kuwait(tb=tb)
+
+    # Harmonize country names.
+    tb = geo.harmonize_countries(
+        tb,
+        countries_file=paths.country_mapping_path,
+        excluded_countries_file=paths.excluded_countries_path,
+    )
+
+    # Replace spurious negative values with zeros (and ensure they are small numbers, within the uncertainty).
+    columns_that_cannot_be_negative = [column for column in tb.columns if "fossil" in column]
+    ####################################################################################################################
+    # TODO: For some reason, cumulative_emissions_ch4_fossil (and therefore cumulative_emissions_ghg_fossil) have
+    # large negative values. For example, Ireland's value in 2022 is -2.93e+08!
+    # I will look into this but, for now, I'll ignore those negative values (we are not using these indicators in
+    # any chart).
+    columns_that_cannot_be_negative = [
+        column
+        for column in columns_that_cannot_be_negative
+        if column not in ["cumulative_emissions_ch4_fossil", "cumulative_emissions_ghg_fossil"]
+    ]
+    ####################################################################################################################
+    for column in columns_that_cannot_be_negative:
+        # Ensure all negative values are just numerical noise.
+        assert (tb[column].fillna(0) >= -2e-4).all()
+        # Replace those values with zero.
+        tb[column] = tb[column].clip(lower=0)
+
+    # Add region aggregates.
+    tb = geo.add_regions_to_table(
+        tb=tb, ds_regions=ds_regions, ds_income_groups=ds_income_groups, regions=REGIONS, min_num_values_per_year=1
+    )
+
+    # Add columns for emissions in terms of CO2 equivalents.
+    tb = add_emissions_in_co2_equivalents(tb=tb)
+
+    # Add "share" variables (percentages with respect to global emissions).
+    tb = add_share_variables(tb=tb)
+
+    # Add per-capita variables.
+    tb = add_per_capita_variables(tb=tb, ds_population=ds_population)
+
+    # Fix spurious jump in the data in 1850.
+    tb = fix_emissions_jump_in_1850(tb=tb)
+
+    # Sanity checks.
+    run_sanity_checks_on_outputs(tb=tb)
+
+    # Set an appropriate index and sort conveniently.
+    tb = tb.format()
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/emissions/latest/owid_co2.meta.yml b/etl/steps/data/garden/emissions/latest/owid_co2.meta.yml
new file mode 100644
index 00000000000..d58145ee7f5
--- /dev/null
+++ b/etl/steps/data/garden/emissions/latest/owid_co2.meta.yml
@@ -0,0 +1,9 @@
+dataset:
+  title: OWID CO2 dataset
+  description: |
+    OWID CO2 dataset.
+
+    This dataset will be loaded by [the co2-data repository](https://github.com/owid/co2-data) to create a CSV file of the dataset that can be downloaded in one click.
+
+# Dataset sources will be created in the step by combining all component datasets' sources.
+# Also, table metadata will be built from the tables' original metadata.
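For reference, the CO₂-equivalent conversion implemented in `national_contributions.py` above comes down to multiplying each gas by its IPCC AR6 factor and adding the result to CO₂. Below is a minimal sketch of that arithmetic with made-up numbers, using plain pandas rather than the owid.catalog Tables the step operates on; the column names mirror the step but the values are purely illustrative.

```python
import pandas as pd

# IPCC AR6 conversion factors, as defined in national_contributions.py.
CH4_FOSSIL_TO_CO2EQ = 29.8  # Fossil methane.
CH4_LAND_TO_CO2EQ = 27.2  # Methane from agriculture and land use.
N2O_TO_CO2EQ = 273  # Nitrous oxide.

# Hypothetical emissions for one country-year, in tonnes of each gas.
df = pd.DataFrame(
    {
        "annual_emissions_co2_total": [1_000_000.0],
        "annual_emissions_ch4_fossil": [10_000.0],
        "annual_emissions_ch4_land": [5_000.0],
        "annual_emissions_n2o_total": [1_000.0],
    }
)

# Convert methane and nitrous oxide emissions to tonnes of CO2-equivalents.
df["annual_emissions_ch4_total_co2eq"] = (
    df["annual_emissions_ch4_fossil"] * CH4_FOSSIL_TO_CO2EQ + df["annual_emissions_ch4_land"] * CH4_LAND_TO_CO2EQ
)
df["annual_emissions_n2o_total_co2eq"] = df["annual_emissions_n2o_total"] * N2O_TO_CO2EQ

# Total GHG emissions are the sum of CO2 and the CO2-equivalent emissions of the other gases.
df["annual_emissions_ghg_total_co2eq"] = (
    df["annual_emissions_co2_total"]
    + df["annual_emissions_ch4_total_co2eq"]
    + df["annual_emissions_n2o_total_co2eq"]
)

# 1,000,000 + 10,000 * 29.8 + 5,000 * 27.2 + 1,000 * 273 = 1,707,000 tonnes of CO2-equivalents.
print(df["annual_emissions_ghg_total_co2eq"].item())
```

Note that fossil methane gets a slightly higher factor (29.8) than methane from agriculture and land use (27.2), which is why the step converts the two methane components separately before summing them.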
diff --git a/etl/steps/data/garden/emissions/latest/owid_co2.py b/etl/steps/data/garden/emissions/latest/owid_co2.py new file mode 100644 index 00000000000..33dd14f123c --- /dev/null +++ b/etl/steps/data/garden/emissions/latest/owid_co2.py @@ -0,0 +1,360 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +Datasets combined: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2020) on +GDP are included. + +""" + + +import numpy as np +from owid.catalog import Dataset, Origin, Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor from tonnes to million tonnes. +TONNES_TO_MILLION_TONNES = 1e-6 + +# Select columns to use from each dataset, and how to rename them. +GCP_COLUMNS = { + "country": "country", + "year": "year", + "emissions_total": "co2", + "emissions_total_per_capita": "co2_per_capita", + "traded_emissions": "trade_co2", + "emissions_from_cement": "cement_co2", + "emissions_from_cement_per_capita": "cement_co2_per_capita", + "emissions_from_coal": "coal_co2", + "emissions_from_coal_per_capita": "coal_co2_per_capita", + "emissions_from_flaring": "flaring_co2", + "emissions_from_flaring_per_capita": "flaring_co2_per_capita", + "emissions_from_gas": "gas_co2", + "emissions_from_gas_per_capita": "gas_co2_per_capita", + "emissions_from_oil": "oil_co2", + "emissions_from_oil_per_capita": "oil_co2_per_capita", + "emissions_from_other_industry": "other_industry_co2", + "emissions_from_other_industry_per_capita": "other_co2_per_capita", + "pct_growth_emissions_total": "co2_growth_prct", + "growth_emissions_total": "co2_growth_abs", + "emissions_total_per_gdp": "co2_per_gdp", + "emissions_total_per_unit_energy": "co2_per_unit_energy", + "consumption_emissions": "consumption_co2", + "consumption_emissions_per_capita": "consumption_co2_per_capita", + "consumption_emissions_per_gdp": "consumption_co2_per_gdp", + "cumulative_emissions_total": "cumulative_co2", + "cumulative_emissions_from_cement": "cumulative_cement_co2", + "cumulative_emissions_from_coal": "cumulative_coal_co2", + "cumulative_emissions_from_flaring": "cumulative_flaring_co2", + "cumulative_emissions_from_gas": "cumulative_gas_co2", + "cumulative_emissions_from_oil": "cumulative_oil_co2", + "cumulative_emissions_from_other_industry": "cumulative_other_co2", + "pct_traded_emissions": "trade_co2_share", + "emissions_total_as_share_of_global": "share_global_co2", + "emissions_from_cement_as_share_of_global": "share_global_cement_co2", + "emissions_from_coal_as_share_of_global": "share_global_coal_co2", + "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", + "emissions_from_gas_as_share_of_global": "share_global_gas_co2", + "emissions_from_oil_as_share_of_global": "share_global_oil_co2", + "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", + "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", + "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", + "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", + 
"cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", + "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", + "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", + "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", + # New variables, related to land-use change emissions. + "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", + "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", + "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", + "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", + "emissions_from_land_use_change": "land_use_change_co2", + "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", + "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", + "emissions_total_including_land_use_change": "co2_including_luc", + "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", + "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", + "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", + "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", + "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", + "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", +} +JONES_COLUMNS = { + "country": "country", + "year": "year", + "temperature_response_co2_total": "temperature_change_from_co2", + "temperature_response_ghg_total": "temperature_change_from_ghg", + "temperature_response_ch4_total": "temperature_change_from_ch4", + "temperature_response_n2o_total": "temperature_change_from_n2o", + "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", +} +CLIMATE_WATCH_GHG_COLUMNS = { + "country": "country", + "year": "year", + "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", + "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", + "total_ghg_emissions_including_lucf": "total_ghg", + "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", +} +CLIMATE_WATCH_CH4_COLUMNS = { + "country": "country", + "year": "year", + "total_ch4_emissions_including_lucf": "methane", + "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", +} +CLIMATE_WATCH_N2O_COLUMNS = { + "country": "country", + "year": "year", + "total_n2o_emissions_including_lucf": "nitrous_oxide", + "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", +} +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", + "primary_energy_consumption_per_capita__kwh": "energy_per_capita", + "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", +} +REGIONS_COLUMNS = { + "name": "country", + "iso_alpha3": "iso_code", +} +POPULATION_COLUMNS = { + "country": "country", + "year": "year", + "population": "population", +} +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}} + + +def convert_units(table: Table) -> Table: + """Convert units of 
table. + + Parameters + ---------- + table : Table + Data with its original units. + + Returns + ------- + Table + Data after converting units of specific columns. + + """ + table = table.copy() + # Check units and convert to more convenient ones. + for column in table.columns: + unit = table[column].metadata.unit + title = table[column].metadata.title + description_short = table[column].metadata.description or table[column].metadata.description_short + if unit in list(UNITS): + table[column] *= UNITS[unit]["conversion"] + table[column].metadata.unit = UNITS[unit]["new_unit"] + table[column].metadata.short_unit = UNITS[unit]["new_short_unit"] + table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"]) + table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"]) + + return table + + +def combine_tables( + tb_gcp: Table, + tb_jones: Table, + tb_climate_watch_ghg: Table, + tb_climate_watch_ch4: Table, + tb_climate_watch_n2o: Table, + tb_energy: Table, + tb_gdp: Table, + tb_population: Table, + tb_regions: Table, +) -> Table: + """Combine tables. + + Parameters + ---------- + tb_gcp : Table + Global Carbon Budget table (from Global Carbon Project). + tb_jones : Table + National contributions to climate change (from Jones et al. (2023)). + tb_climate_watch_ghg : Table + Greenhouse gas emissions table (from Climate Watch). + tb_climate_watch_ch4 : Table + CH4 emissions table (from Climate Watch). + tb_climate_watch_n2o : Table + N2O emissions table (from Climate Watch). + tb_energy : Table + Primary energy consumption table (from BP & EIA). + tb_gdp : Table + Maddison GDP table (from GGDC). + tb_population : Table + OWID population table (from various sources). + tb_regions : Table + OWID regions table. + + Returns + ------- + combined : Table + Combined table with metadata and variables metadata. + + """ + # Combine main tables (with an outer join, to gather all entities from all tables). + combined = tb_gcp.copy() + for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]: + combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) + + # Add secondary tables (with a left join, to keep only entities for which we have emissions data). + for table in [tb_energy, tb_gdp, tb_population]: + combined = combined.merge(table, on=["country", "year"], how="left") + + # Countries-regions dataset does not have a year column, so it has to be merged on country. + combined = combined.merge(tb_regions, on="country", how="left") + + # Check that there were no repetition in column names. + error = "Repeated columns in combined data." + assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error + + # Adjust units. + combined = convert_units(combined) + + return combined + + +def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table: + """Clean and prepare output table. + + Parameters + ---------- + combined : Table + Combined table. + ds_regions : Dataset + Regions dataset, only used to get its version. + + Returns + ------- + combined: Table + Cleaned combined table. + + """ + # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). 
+ columns_that_must_have_data = [ + column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] + ] + combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) + + # Add metadata to the ISO column (loaded from the regions dataset). + combined["iso_code"].m.origins = [ + Origin( + producer="International Organization for Standardization", + title="Regions", + date_published=ds_regions.version, + ) + ] + combined["iso_code"].metadata.title = "ISO code" + combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes." + combined["iso_code"].metadata.unit = "" + + # Sanity check. + columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] + assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" + + # Set index and sort conveniently. + combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() + + return combined + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the global carbon budget dataset from the Global Carbon Project (GCP). + ds_gcp = paths.load_dataset("global_carbon_budget") + + # Load the Jones et al. (2023) dataset on national contributions to climate change. + ds_jones = paths.load_dataset("national_contributions") + + # Load the greenhouse gas emissions by sector dataset by Climate Watch. + ds_climate_watch = paths.load_dataset("emissions_by_sector") + + # Load the GDP dataset by GGDC Maddison. + ds_gdp = paths.load_dataset("ggdc_maddison") + + # Load primary energy consumption dataset (by different sources in our 'energy' namespace). + ds_energy = paths.load_dataset("primary_energy_consumption") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load countries-regions dataset (required to get ISO codes). + ds_regions = paths.load_dataset("regions") + + # Gather all required tables from all datasets. + tb_gcp = ds_gcp["global_carbon_budget"] + tb_jones = ds_jones["national_contributions"] + tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"] + tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"] + tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"] + tb_energy = ds_energy["primary_energy_consumption"] + tb_gdp = ds_gdp["maddison_gdp"] + tb_population = ds_population["population"] + tb_regions = ds_regions["regions"] + + # + # Process data. + # + # Choose required columns and rename them. 
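+    # (Selecting columns via list(<MAPPING>) and renaming with errors="raise" makes the step fail loudly if any
+    # expected column is missing or has been renamed upstream.)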
+ tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") + tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") + tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename( + columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise" + ) + tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename( + columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise" + ) + tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename( + columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise" + ) + tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( + columns=PRIMARY_ENERGY_COLUMNS, errors="raise" + ) + tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") + tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( + columns=POPULATION_COLUMNS, errors="raise" + ) + tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") + + # Combine tables. + combined = combine_tables( + tb_gcp=tb_gcp, + tb_jones=tb_jones, + tb_climate_watch_ghg=tb_climate_watch_ghg, + tb_climate_watch_ch4=tb_climate_watch_ch4, + tb_climate_watch_n2o=tb_climate_watch_n2o, + tb_energy=tb_energy, + tb_gdp=tb_gdp, + tb_population=tb_population, + tb_regions=tb_regions, + ) + + # Prepare outputs. + combined = prepare_outputs(combined=combined, ds_regions=ds_regions) + + # + # Save outputs. + # + ds_garden = create_dataset(dest_dir, tables=[combined], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml new file mode 100644 index 00000000000..933924e021d --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.meta.yml @@ -0,0 +1,30 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: major + description_key: + - Based on ice core studies of historical concentration of greenhouse gases, and recent air monitoring sites around the world. + description_from_producer: |- + This indicator describes how the levels of major greenhouse gases (GHGs) in the atmosphere have changed over geological time and in recent years. Changes in atmospheric GHGs, in part caused by human activities, affect the amount of energy held in the Earth-atmosphere system and thus affect the Earth's climate. This indicator is highly relevant to climate change because greenhouse gases from human activities are the primary driver of observed climate change since the mid-20th century (IPCC, 2021). 
+ +dataset: + update_period_days: 0 + +tables: + ghg_concentration: + title: Global Atmospheric Concentrations of Greenhouse Gases + variables: + co2_concentration: + title: Global atmospheric concentration of carbon dioxide + unit: parts per million + short_unit: ppm + ch4_concentration: + title: Global atmospheric concentration of methane + unit: parts per billion + short_unit: ppb + n2o_concentration: + title: Global atmospheric concentration of nitrous oxide + unit: parts per billion + short_unit: ppb diff --git a/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py new file mode 100644 index 00000000000..e244a717be8 --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ghg_concentration.py @@ -0,0 +1,75 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Columns to read from the data, and how to rename them. +COLUMNS_CO2 = { + "year": "year", + "antarctic_ice_cores": "co2_concentration", +} +COLUMNS_CH4 = { + "year": "year", + "epica_dome_c__antarctica": "ch4_concentration", +} +COLUMNS_N2O = { + "year": "year", + "epica_dome_c__antarctica": "n2o_concentration", +} + + +def approximate_data_for_each_year(tb: Table, column: str) -> Table: + tb = tb.copy() + + # Round each year to its closer integer. + tb["year"] = tb["year"].round(0).astype(int) + + # If there are multiple rows for a given year, take the average value. + tb = tb.groupby("year", as_index=False).agg({column: "mean"}) + + # Remove empty rows. + tb = tb.dropna(subset=[column]).reset_index(drop=True) + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its tables. + ds_meadow = paths.load_dataset("ghg_concentration") + tb_co2 = ds_meadow["co2_concentration"].reset_index() + tb_ch4 = ds_meadow["ch4_concentration"].reset_index() + tb_n2o = ds_meadow["n2o_concentration"].reset_index() + + # + # Process data. + # + # Select and rename columns. + tb_co2 = tb_co2[list(COLUMNS_CO2)].rename(columns=COLUMNS_CO2, errors="raise") + tb_ch4 = tb_ch4[list(COLUMNS_CH4)].rename(columns=COLUMNS_CH4, errors="raise") + tb_n2o = tb_n2o[list(COLUMNS_N2O)].rename(columns=COLUMNS_N2O, errors="raise") + + # Since pandas datetime cannot handle such long past dates, for simplicity, round up years, and take average + # concentration of year for which there are multiple rows. + tb_co2 = approximate_data_for_each_year(tb_co2, "co2_concentration") + tb_ch4 = approximate_data_for_each_year(tb_ch4, "ch4_concentration") + tb_n2o = approximate_data_for_each_year(tb_n2o, "n2o_concentration") + + # Combine data for all gases. + tb = tb_co2.merge(tb_ch4, on="year", how="outer").merge(tb_n2o, on="year", how="outer", short_name=paths.short_name) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml new file mode 100644 index 00000000000..d7791eb36f8 --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.meta.yml @@ -0,0 +1,31 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: major + unit: billion tonnes + short_unit: billion t + description_key: + - Values are centered at zero in 2002 to provide a consistent point of reference. + - A downward slope indicates a net loss of ice and snow. + - For reference, 1,000 billion metric tons is enough to raise sea level by about 3 millimeters. + +dataset: + title: Ice Sheet Mass Balance + update_period_days: 0 + +tables: + ice_sheet_mass_balance: + title: Ice Sheet Mass Balance + variables: + cumulative_ice_mass_change_imbie: + title: Cumulative change in mass in the ice sheets, according to IMBIE + description_short: Measured in billion tonnes. Based on more than 20 different studies that have been combined for each region. + presentation: + title_variant: IMBIE + land_ice_mass_nasa: + title: Cumulative change in mass in the ice sheets, according to NASA/JPL + description_short: Measured in billion tonnes. + presentation: + title_variant: NASA/JPL diff --git a/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py new file mode 100644 index 00000000000..8c03e21269c --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ice_sheet_mass_balance.py @@ -0,0 +1,91 @@ +"""Load a meadow dataset and create a garden dataset.""" + +import owid.catalog.processing as pr +import pandas as pd +from owid.catalog import Table + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def fix_multiple_rows_for_the_same_year(tb: Table) -> Table: + # There are repeated years, but there are no ambiguities (i.e. for each column, either the first or the second + # repeated year has data, not both). + # To fix that, remove nans from each column and merge them together. 
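+    # (For example, if one of the two rows for a given year holds only the NASA value and the other only the IMBIE
+    # value, the output contains a single row for that year with both values.)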
+    tb_corrected = tb[["year"]].drop_duplicates().reset_index(drop=True)
+    for column in tb.columns[1:]:
+        tb_column = tb[["year", column]].dropna().reset_index(drop=True)
+        assert tb_column[tb_column.duplicated(subset="year", keep=False)].empty
+        tb_corrected = tb_corrected.merge(tb_column, how="outer", on="year")
+
+    return tb_corrected
+
+
+def decimal_date_to_date(year: float) -> str:
+    # Convert a decimal year (e.g. 2002.5) into a date, assuming an average year length of 365.2425 days.
+    return str((pd.to_datetime(int(year), format="%Y") + pd.Timedelta(days=(year % 1) * 365.2425)).date())
+
+
+def separate_antarctica_and_greenland_data(tb: Table) -> Table:
+    columns_antarctica = {
+        "date": "date",
+        "nasa__antarctica_land_ice_mass": "land_ice_mass_nasa",
+        "imbie__antarctica_cumulative_ice_mass_change": "cumulative_ice_mass_change_imbie",
+    }
+    tb_antarctica = (
+        tb[list(columns_antarctica)]
+        .rename(columns=columns_antarctica, errors="raise")
+        .assign(**{"location": "Antarctica"})
+        .copy()
+    )
+    columns_greenland = {
+        "date": "date",
+        "nasa__greenland_land_ice_mass": "land_ice_mass_nasa",
+        "imbie__greenland_cumulative_ice_mass_change": "cumulative_ice_mass_change_imbie",
+    }
+    tb_greenland = (
+        tb[list(columns_greenland)]
+        .rename(columns=columns_greenland, errors="raise")
+        .assign(**{"location": "Greenland"})
+        .copy()
+    )
+
+    # Combine data for Antarctica and Greenland.
+    tb_combined = pr.concat([tb_antarctica, tb_greenland], ignore_index=True)
+
+    return tb_combined
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and read its main table.
+    ds_meadow = paths.load_dataset("ice_sheet_mass_balance")
+    tb = ds_meadow["ice_sheet_mass_balance"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Fix issue with the original data, where there are multiple rows for the same year.
+    tb = fix_multiple_rows_for_the_same_year(tb=tb)
+
+    # Remove empty rows.
+    tb = tb.dropna(how="all")
+
+    # Create a date column (given that "year" is given with decimals).
+    tb["date"] = tb["year"].apply(decimal_date_to_date).astype(str)
+
+    # Separate data for Antarctica and Greenland.
+    tb = separate_antarctica_and_greenland_data(tb=tb)
+
+    # Set an appropriate index to each table and sort conveniently.
+    tb = tb.set_index(["location", "date"], verify_integrity=True).sort_index().sort_index(axis=1)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True)
+    ds_garden.save()
diff --git a/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml
new file mode 100644
index 00000000000..db21db39b00
--- /dev/null
+++ b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.meta.yml
@@ -0,0 +1,17 @@
+dataset:
+  update_period_days: 0
+
+tables:
+  mass_balance_us_glaciers:
+    title: Mass Balance of Glaciers in the United States
+    variables:
+      mass_balance_us_glaciers:
+        title: Cumulative mass balance
+        unit: meters
+        short_unit: m
+        description_short: |-
+          Measured in meters of water equivalent, which represent changes in the average thickness of a glacier relative to the base year 1965.
+ presentation: + topic_tags: + - Climate Change + processing_level: minor diff --git a/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py new file mode 100644 index 00000000000..87faf60c2cf --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/mass_balance_us_glaciers.py @@ -0,0 +1,39 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("mass_balance_us_glaciers") + tb = ds_meadow["mass_balance_us_glaciers"].reset_index() + + # + # Process data. + # + # Change column names to human-readable names. + tb = tb.rename( + columns={column: column.replace("_", " ").title() for column in tb.columns if column != "year"}, errors="raise" + ) + + # Transpose table to have location as a column. + tb = tb.melt(id_vars=["year"], var_name="location", value_name="mass_balance_us_glaciers") + + # Remove empty rows. + tb = tb.dropna().reset_index(drop=True) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new garden dataset. + ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml new file mode 100644 index 00000000000..0eec41a7f0e --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.meta.yml @@ -0,0 +1,34 @@ +definitions: + common: + presentation: + topic_tags: + - Climate Change + processing_level: minor + description_processing: |- + The amount of heat in the ocean, or ocean heat content, is an important indicator of climate change because the oceans ultimately absorb a large portion of the extra energy that greenhouse gases trap near the Earth's surface. Ocean heat content also plays an important role in the Earth's climate system because heat from ocean surface waters provides energy for storms and thereby influences weather patterns. + description_short: Measured in 10²² Joules. 
+ unit: 10²² Joules + short_unit: 10²² J + +dataset: + title: Ocean Heat Content + update_period_days: 0 + +tables: + ocean_heat_content: + title: Ocean Heat Content + variables: + ocean_heat_content_iap_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer, according to IAP + ocean_heat_content_noaa_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer, according to NOAA + ocean_heat_content_mri_2000m: + title: Annual average ocean heat content for the 0-2000 meters layer, according to MRI/JMA + ocean_heat_content_mri_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to MRI/JMA + ocean_heat_content_noaa_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to NOAA + ocean_heat_content_iap_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to IAP + ocean_heat_content_csiro_700m: + title: Annual average ocean heat content for the 0-700 meters layer, according to CSIRO diff --git a/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py new file mode 100644 index 00000000000..df33d15ae7e --- /dev/null +++ b/etl/steps/data/garden/epa/2024-04-17/ocean_heat_content.py @@ -0,0 +1,35 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset and read its main table. + ds_meadow = paths.load_dataset("ocean_heat_content") + tb = ds_meadow["ocean_heat_content"].reset_index() + + # + # Process data. + # + # Instead of having a column for depth, create columns of heat content for each depth. + tb["depth"] = tb["depth"].astype(str) + "m" + tb = tb.pivot(index=["location", "year"], columns="depth", join_column_levels_with="_") + + # Delete columns with no data. + tb = tb.dropna(how="all", axis=1).reset_index(drop=True) + + # Set an appropriate index to each table and sort conveniently. + tb = tb.set_index(["location", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # + # Save outputs. + # + # Create a new garden dataset. 
+ ds_garden = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True) + ds_garden.save() diff --git a/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py b/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py index 380500d9c2f..4a940cfef89 100644 --- a/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py +++ b/etl/steps/data/garden/excess_mortality/latest/xm_karlinsky_kobak.py @@ -262,7 +262,9 @@ def add_uk_by_age(df: pd.DataFrame): time_units = df_uk["time_unit"].unique() assert len(time_units) == 1, "There are multiple time units for UK Nations" # Estimate metrics - df_uk = df_uk.groupby(["year", "time", "age"], as_index=False).sum(min_count=3) + df_uk = ( + df_uk.drop(columns=["entity", "time_unit"]).groupby(["year", "time", "age"], as_index=False).sum(min_count=3) + ) # Reassign entity name and time unit df_uk["entity"] = "United Kingdom" df_uk["time_unit"] = time_units[0] diff --git a/etl/steps/data/garden/faostat/2022-05-17/shared.py b/etl/steps/data/garden/faostat/2022-05-17/shared.py index 2422e17b7c4..d7fb893e2cc 100644 --- a/etl/steps/data/garden/faostat/2022-05-17/shared.py +++ b/etl/steps/data/garden/faostat/2022-05-17/shared.py @@ -1366,7 +1366,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1417,7 +1417,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2023-02-22/shared.py b/etl/steps/data/garden/faostat/2023-02-22/shared.py index 6f18800d737..120f7f476df 100644 --- a/etl/steps/data/garden/faostat/2023-02-22/shared.py +++ b/etl/steps/data/garden/faostat/2023-02-22/shared.py @@ -1304,7 +1304,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1355,7 +1355,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. 
element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2023-06-12/shared.py b/etl/steps/data/garden/faostat/2023-06-12/shared.py index 1953069445b..9c6774e9f77 100644 --- a/etl/steps/data/garden/faostat/2023-06-12/shared.py +++ b/etl/steps/data/garden/faostat/2023-06-12/shared.py @@ -1314,7 +1314,7 @@ def convert_variables_given_per_capita_to_total_value( # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: data = data.copy() @@ -1365,7 +1365,7 @@ def add_per_capita_variables(data: pd.DataFrame, elements_metadata: pd.DataFrame # Find element codes that have to be made per capita. element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=data.shape) diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py index 17514fea34c..03c0c45e48b 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_metadata.py @@ -498,7 +498,9 @@ def create_elements_table_for_domain(table: Table, metadata: Dataset, dataset_sh .sort_values(["fao_unit_short_name"]) .reset_index(drop=True) ) - elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna(elements_from_data["fao_unit_short_name"]) + elements_from_data["fao_unit"] = elements_from_data["fao_unit"].fillna( + elements_from_data["fao_unit_short_name"].astype(object) + ) # Sanity checks: diff --git a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py index 3c64dab36c9..6e8e4687417 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py +++ b/etl/steps/data/garden/faostat/2024-03-14/faostat_qcl.py @@ -1,6 +1,5 @@ """FAOSTAT garden step for faostat_qcl dataset.""" -from pathlib import Path import numpy as np import owid.catalog.processing as pr @@ -31,18 +30,39 @@ ITEM_CODE_MEAT_POULTRY = "00001808" # Item code for 'Meat, chicken'. ITEM_CODE_MEAT_CHICKEN = "00001058" -# List item codes to sum as part of "Meat, total" (avoiding double-counting items). +# Given that the number of slaughtered animals to produce all meat is not provided, we estimate it by aggregating the +# number of slaughtered animals for each meat item. +# List item codes to sum as part of "Meat, Total" (avoiding double-counting items). +# This list can be found following these steps: +# * Go to: https://www.fao.org/faostat/en/#definitions +# * Click on "Item Group" (on the left column). +# * Type "1765" in the search bar (which is the Item Group Code corresponding to Item Group "Meat, Total"). +# * Download the output of the search as a CSV file. 
+# * Open the file, filter by Item Group Code "1765" and Domain Code "QCL". +# * The list of item codes and item names are in columns "Item Code" and "Item". MEAT_TOTAL_ITEM_CODES = [ - "00000977", # 'Meat, lamb and mutton' (previously 'Meat, lamb and mutton') - "00001035", # 'Meat of pig with the bone, fresh or chilled' (previously 'Meat, pig') - "00001097", # 'Horse meat, fresh or chilled' (previously 'Meat, horse') - "00001108", # 'Meat of asses, fresh or chilled' (previously 'Meat, ass') - "00001111", # 'Meat of mules, fresh or chilled' (previously 'Meat, mule') - "00001127", # 'Meat of camels, fresh or chilled' (previously 'Meat, camel') - "00001141", # 'Meat of rabbits and hares, fresh or chilled' (previously 'Meat, rabbit') - "00001806", # 'Meat, beef and buffalo' (previously 'Meat, beef and buffalo') - "00001807", # 'Meat, sheep and goat' (previously 'Meat, sheep and goat') - ITEM_CODE_MEAT_POULTRY, # 'Meat, poultry' (previously 'Meat, poultry') + "00001058", # 'Meat of chickens, fresh or chilled', + "00001069", # 'Meat of ducks, fresh or chilled', + "00001035", # 'Meat of pig with the bone, fresh or chilled', + "00001017", # 'Meat of goat, fresh or chilled', + "00000977", # 'Meat of sheep, fresh or chilled', + "00000867", # 'Meat of cattle with the bone, fresh or chilled', + "00000947", # 'Meat of buffalo, fresh or chilled', + "00001127", # 'Meat of camels, fresh or chilled', + "00001097", # 'Horse meat, fresh or chilled', + "00001080", # 'Meat of turkeys, fresh or chilled', + "00001141", # 'Meat of rabbits and hares, fresh or chilled', + "00001163", # 'Game meat, fresh, chilled or frozen', + "00001108", # 'Meat of asses, fresh or chilled', + "00001073", # 'Meat of geese, fresh or chilled', + "00001111", # 'Meat of mules, fresh or chilled', + "00001166", # 'Other meat n.e.c. (excluding mammals), fresh, chilled or frozen', + "00001158", # 'Meat of other domestic camelids, fresh or chilled', + "00001151", # 'Meat of other domestic rodents, fresh or chilled', + "00001089", # 'Meat of pigeons and other birds n.e.c., fresh, chilled or frozen', + "00001176", # 'Snails, fresh, chilled, frozen, dried, salted or in brine, except sea snails', + # Items that were in the list of "Meat, Total", but were not in the data: + # "00001083", # 'Other birds', ] # List of element codes for "Producing or slaughtered animals" (they have different items assigned). @@ -163,17 +183,10 @@ def add_slaughtered_animals_to_meat_total(tb: Table) -> Table: error = f"Some items required to get the aggregate '{TOTAL_MEAT_ITEM}' are missing in data." assert set(MEAT_TOTAL_ITEM_CODES) < set(tb["item_code"]), error - assert SLAUGHTERED_ANIMALS_ELEMENT in tb["element"].unique() - assert SLAUGHTERED_ANIMALS_UNIT in tb["unit"].unique() + assert SLAUGHTERED_ANIMALS_ELEMENT in set(tb["element"]) + assert SLAUGHTERED_ANIMALS_UNIT in set(tb["unit"]) - # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. - assert tb[ - (tb["item"] == TOTAL_MEAT_ITEM) - & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) - & (tb["unit"] == SLAUGHTERED_ANIMALS_UNIT) - ].empty - - # There are two element codes for the same element (they have different items assigned). + # Check that there are two element codes for the same element (they have different items assigned). error = "Element codes for 'Producing or slaughtered animals' may have changed." 
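# Editorial note: the MEAT_TOTAL_ITEM_CODES list above can be rebuilt from the CSV described in the
# step-by-step comments. A minimal sketch, assuming the downloaded file is saved as
# "faostat_item_groups.csv" (a hypothetical name); the column names "Item Group Code", "Domain Code"
# and "Item Code" are the ones quoted in the comments:
import pandas as pd

item_groups = pd.read_csv("faostat_item_groups.csv", dtype=str)
# Keep only the rows belonging to Item Group "Meat, Total" (code 1765) within the QCL domain.
meat_total = item_groups[(item_groups["Item Group Code"] == "1765") & (item_groups["Domain Code"] == "QCL")]
# Item codes in this step are zero-padded to 8 characters (e.g. "00001058").
meat_total_item_codes = sorted(meat_total["Item Code"].str.zfill(8).unique())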
assert ( tb[(tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT) & ~(tb["element_code"].str.contains("pc"))]["element_code"] @@ -182,6 +195,13 @@ def add_slaughtered_animals_to_meat_total(tb: Table) -> Table: == SLAUGHTERED_ANIMALS_ELEMENT_CODES ), error + # Check that they use the same unit. + error = "Unit for element 'Producing or slaughtered animals' may have changed." + assert set(tb[(tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT)]["unit"]) == set(["animals"]), error + + # Check that, indeed, the number of slaughtered animals for total meat is not given in the original data. + assert tb[(tb["item"] == TOTAL_MEAT_ITEM) & (tb["element"] == SLAUGHTERED_ANIMALS_ELEMENT)].empty + # Check that the items assigned to each the two element codes do not overlap. error = "Element codes for 'Producing or slaughtered animals' have overlapping items." items_for_different_elements = ( @@ -437,7 +457,7 @@ def run(dest_dir: str) -> None: # Load data. # # Fetch the dataset short name from dest_dir. - dataset_short_name = Path(dest_dir).name + dataset_short_name = f"{NAMESPACE}_qcl" # Define path to current step file. current_step_file = (CURRENT_DIR / dataset_short_name).with_suffix(".py") diff --git a/etl/steps/data/garden/faostat/2024-03-14/shared.py b/etl/steps/data/garden/faostat/2024-03-14/shared.py index ec239660fea..9377889c115 100644 --- a/etl/steps/data/garden/faostat/2024-03-14/shared.py +++ b/etl/steps/data/garden/faostat/2024-03-14/shared.py @@ -974,8 +974,11 @@ def remove_overlapping_data_between_historical_regions_and_successors( columns ].drop_duplicates() # Find unique years where the above combinations of item-element-years of region and successors overlap. - overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) - overlapping_years = overlapping_years[overlapping_years.duplicated()] + if historical_region_years.empty and historical_successors_years.empty: + overlapping_years = pd.DataFrame() + else: + overlapping_years = pr.concat([historical_region_years, historical_successors_years], ignore_index=True) + overlapping_years = overlapping_years[overlapping_years.duplicated()] if not overlapping_years.empty: log.warning( f"Removing rows where historical region {historical_region} overlaps with its successors " @@ -1298,7 +1301,7 @@ def convert_variables_given_per_capita_to_total_value(tb: Table, elements_metada # All variables in the custom_elements_and_units.csv file with "was_per_capita" True will be converted into # total (non-per-capita) values. element_codes_that_were_per_capita = list( - elements_metadata[elements_metadata["was_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["was_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_that_were_per_capita) > 0: tb = tb.copy() @@ -1349,7 +1352,7 @@ def add_per_capita_variables(tb: Table, elements_metadata: Table) -> Table: # Find element codes that have to be made per capita. 
element_codes_to_make_per_capita = list( - elements_metadata[elements_metadata["make_per_capita"]]["element_code"].unique() + elements_metadata[elements_metadata["make_per_capita"] == 1]["element_code"].unique() ) if len(element_codes_to_make_per_capita) > 0: log.info("add_per_capita_variables", shape=tb_with_pc_variables.shape) diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml index 1f6e46694a1..550e22f18f0 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.meta.yml @@ -52,7 +52,7 @@ tables: conversionFactor: 1 years_colonized: - title: Years the country has been colonized + title: Years a country was a European overseas colony unit: "years" short_unit: "years" description_short: | @@ -64,7 +64,7 @@ tables: description_from_producer: "" processing_level: major display: - name: Years colonized + name: Years a country was a European overseas colony entityAnnotationsMap: "" numDecimalPlaces: 0 conversionFactor: 1 diff --git a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py index f2b706763db..fa61e722384 100644 --- a/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py +++ b/etl/steps/data/garden/harvard/2023-09-18/colonial_dates_dataset.py @@ -182,6 +182,9 @@ def regional_aggregations(tb: Table, tb_pop: Table) -> Table: # Define non-colonies identifiers for `colonizer` non_colonies = ["zz. Colonizer", "zzz. Not colonized", "zzzz. No longer colonized"] + # Backwards compatibility + tb_regions["colonizer"] = tb_regions["colonizer"].astype(object).fillna(np.nan) + # Define colony_number, which is 1 if countries are not in non_colonies and colony_pop, which is the product of colony and population tb_regions["colony_number"] = tb_regions["colonizer"].apply(lambda x: 0 if x in non_colonies else 1) tb_regions["colony_pop"] = tb_regions["population"] * tb_regions["colony_number"] @@ -327,7 +330,7 @@ def correct_european_countries(tb: Table) -> Table: european_countries = geo.list_countries_in_region(region="Europe") # If the country is in european_countries and last_colonizer is not "zzzz. Never colonized", assign nan to colonizer - for col in ["colonizer", "colonizer_grouped", "last_colonizer", "last_colonizer_grouped"]: + for col in ["colonizer", "colonizer_grouped", "last_colonizer", "years_colonized", "last_colonizer_grouped"]: tb[col] = tb[col].where( ~((tb["country"].isin(european_countries)) & (tb["last_colonizer_grouped"] == "zzzz.
Never colonized")), np.nan, diff --git a/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py b/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py index 5131c53c201..0e5ed5f2577 100644 --- a/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py +++ b/etl/steps/data/garden/homicide/2023-01-03/who_mort_db.py @@ -48,7 +48,7 @@ def run(dest_dir: str) -> None: ds_meadow = Dataset(DATA_DIR / "meadow/homicide/2023-01-03/who_mort_db") tb_meadow = ds_meadow["who_mort_db"] - df = pd.DataFrame(tb_meadow) + df = pd.DataFrame(tb_meadow).astype({"number_of_deaths": float}) log.info("who_mort_db.exclude_countries") df = exclude_countries(df) @@ -92,7 +92,7 @@ def run(dest_dir: str) -> None: def clean_up_dimensions(df: pd.DataFrame) -> pd.DataFrame: sex_dict = {"All": "Both Sexes", "Male": "Males", "Female": "Females", "Unknown": "Unknown sex"} age_dict = {"Age_all": "All ages", "Age_unknown": "Unknown age"} - df = df.replace({"sex": sex_dict, "age_group_code": age_dict}) + df = df.astype({"sex": str, "age_group_code": str}).replace({"sex": sex_dict, "age_group_code": age_dict}) return df diff --git a/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py b/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py index 38473bb5aba..5c3ec2b8b5d 100644 --- a/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py +++ b/etl/steps/data/garden/ihme_gbd/2023-06-14/prevalence_dalys_world.py @@ -102,7 +102,7 @@ def make_table_prevalence(ds: Dataset) -> Table: "share_eating_disorders": "Eating disorders", "share_schizophrenia_disorders": "Schizophrenia", } - tb = tb.rename(columns=column_rename)[set(column_rename.values()) | {"year"}] + tb = tb.rename(columns=column_rename)[list(set(column_rename.values()) | {"year"})] # Unpivot tb = tb.melt(id_vars=["year"], var_name="cause", value_name="share_rate") diff --git a/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml b/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml index dc0290db97b..7ccf28e173a 100644 --- a/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml +++ b/etl/steps/data/garden/nasa/2023-03-06/ozone_hole_area.meta.yml @@ -16,9 +16,6 @@ dataset: Minimum and mean Southern Hemisphere daily ozone concentrations, measured in Dobson Units (DU). This dataset should be next updated by the source every year. We will update it on Our World in Data soon after the new version is published. At the link above you can directly access the source page and see the latest available data. - licenses: - - name: # TO BE FILLED. Example: Testing License Name - url: # TO BE FILLED. 
Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.countries.json b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.countries.json new file mode 100644 index 00000000000..8ac42f96724 --- /dev/null +++ b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.countries.json @@ -0,0 +1,47 @@ +{ + "Australia": "Australia", + "Austria": "Austria", + "Belgium": "Belgium", + "Brazil": "Brazil", + "Bulgaria": "Bulgaria", + "Canada": "Canada", + "Chile": "Chile", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Czechia": "Czechia", + "Denmark": "Denmark", + "Estonia": "Estonia", + "Finland": "Finland", + "France": "France", + "Germany": "Germany", + "Greece": "Greece", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Japan": "Japan", + "Latvia": "Latvia", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Mexico": "Mexico", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Norway": "Norway", + "Poland": "Poland", + "Portugal": "Portugal", + "Romania": "Romania", + "Russia": "Russia", + "Slovak Republic": "Slovakia", + "Slovenia": "Slovenia", + "South Africa": "South Africa", + "Spain": "Spain", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "United Kingdom": "United Kingdom", + "United States": "United States", + "China (People\u2019s Republic of)": "China", + "Korea": "South Korea", + "T\u00fcrkiye": "Turkey" +} \ No newline at end of file diff --git a/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.meta.yml b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.meta.yml new file mode 100644 index 00000000000..35c69c2470e --- /dev/null +++ b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.meta.yml @@ -0,0 +1,366 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + processing_level: minor + display: &common-display + tolerance: 5 + presentation: + topic_tags: + - Economic Inequality + - Poverty + + gini: |- + The [Gini coefficient](#dod:gini) measures inequality on a scale from 0 to 1. Higher values indicate higher inequality. + disposable_income: |- + Income is ‘post-tax’ — measured after taxes have been paid and most government benefits have been received. + market_income: |- + Income is ‘pre-tax’ — measured before taxes have been paid and most government benefits have been received. However, data for China, Hungary, Mexico, Turkey as well as part of the data for Greece refer to the income post taxes and before transfers. + gross_income: |- + Income here is measured before taxes and after benefits. + equivalization: |- + Income has been equivalized – adjusted to account for the fact that people in the same household can share costs like rent and heating. + additional_info: |- + {definitions.age_groups}, and also the income definition is the newest from the OECD since 2012. For more information on the methodology, visit the [OECD Income Distribution Database (IDD)](http://www.oecd.org/social/income-distribution-database.htm). + covid: |- + Survey estimates for 2020 are subject to additional uncertainty and are to be treated with extra caution, as in most countries the survey fieldwork was affected by the Coronavirus (COVID-19) pandemic. 
+ + age_groups: |- + <% if age == "Total" %> + The entire population of each country is considered + <% elif age == "Working population" %> + Only working-age population is considered (from 18 to 65 years old) + <% elif age == "Over 65 years" %> + Only population over 65 years old is considered + <%- endif -%> + + source_gini: |- + The Gini coefficient is based on the comparison of cumulative proportions of the population against cumulative proportions of income they receive, and it ranges between 0 in the case of perfect equality and 1 in the case of perfect inequality. + source_gini_market: |- + The Gini coefficient for market income refers to income before taxes and transfers. However, data for China, Hungary, Mexico and Turkey as well as data for Greece from the Household Budget Survey refer to the income post taxes and before transfers. + source_palma_ratio: |- + The Palma ratio is the share of all income received by the 10% people with highest disposable income divided by the share of all income received by the 40% people with the lowest disposable income. + source_p90_p10_ratio: |- + The P90/P10 ratio is the ratio of the upper bound value of the ninth decile (i.e. the 10% of people with highest income) to that of the upper bound value of the first decile. + source_p90_p50_ratio: |- + The P90/P50 ratio is the ratio of the upper bound value of the ninth decile to the median income. + source_p50_p10_ratio: |- + The P50/P10 ratio is the ratio of median income to the upper bound value of the first decile. + source_s80_s20_ratio: |- + The S80/S20 ratio is the share of all income received by the top quintile divided by the share of the first, or the ratio of the average income of the top quintile to that of the first. + source_headcount_ratio_market: |- + Data for Hungary, Mexico and Turkey as well as data for Greece from the Household Budget Survey refer to the income post taxes and before transfers. + source_income_definition_2012: |- + Data calculated according to the new OECD Terms of reference. Compared to previous terms of reference, these include a more detailed breakdown of current transfers received and paid by households as well as a revised definition of household income, including the value of goods produced for own consumption as an element of self-employed income. + source_recommended_uses_and_limitations: |- + The OECD Income Distribution database (IDD) has been developed to benchmark and monitor countries’ performance in the field of income inequality and poverty. It contains a number of standardised indicators based on the central concept of “equivalised household disposable income”, i.e. the total income received by the households less the current taxes and transfers they pay, adjusted for household size with an equivalence scale. While household income is only one of the factors shaping people’s economic well-being, it is also the one for which comparable data for all OECD countries are most common. Income distribution has a long-standing tradition among household-level statistics, with regular data collections going back to the 1980s (and sometimes earlier) in many OECD countries. + + Achieving comparability in this field is a challenge, as national practices differ widely in terms of concepts, measures, and statistical sources. In order to maximise international comparability as well as inter-temporal consistency of data, the IDD data collection and compilation process is based on a common set of statistical conventions (e.g. on income concepts and components). 
The information obtained by the OECD through a network of national data providers, via a standardized questionnaire, is based on national sources that are deemed to be most representative for each country. + + Small changes in estimates between years should be treated with caution as they may not be statistically significant. + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + + + +tables: + income_distribution_database: + variables: + gini_disposable: + title: Gini coefficient (disposable income) - <> + unit: "" + short_unit: "" + description_short: "{definitions.gini}" + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_gini} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: Gini coefficient (disposable income) + display: + name: Gini coefficient (disposable income) + numDecimalPlaces: 2 + <<: *common-display + + gini_gross: + title: Gini coefficient (gross income) - <> + unit: "" + short_unit: "" + description_short: "{definitions.gini}" + description_key: + - "{definitions.gross_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_gini} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: Gini coefficient (gross income) + display: + name: Gini coefficient (gross income) + numDecimalPlaces: 2 + <<: *common-display + + gini_market: + title: Gini coefficient (market income) - <> + unit: "" + short_unit: "" + description_short: "{definitions.gini}" + description_key: + - "{definitions.market_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_gini} + + {definitions.source_gini_market} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: Gini coefficient (market income) + display: + name: Gini coefficient (market income) + numDecimalPlaces: 2 + <<: *common-display + + p50_p10_ratio_disposable: + title: P50/P10 ratio (disposable income) - <> + unit: "" + short_unit: "" + description_short: The P50/P10 ratio measures the degree of inequality within the poorest half of the population. A ratio of 2 means that the median income or consumption is two times higher than that of someone just falling in the poorest tenth of the population. + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_p50_p10_ratio} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: P50/P10 ratio (disposable income) + display: + name: P50/P10 ratio (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + p90_p10_ratio_disposable: + title: P90/P10 ratio (disposable income) - <> + unit: "" + short_unit: "" + description_short: P90 and P10 are the levels of income or consumption below which 90% and 10% of the population live, respectively. 
This variable gives the ratio of the two. It is a measure of inequality that indicates the gap between the richest and poorest tenth of the population. + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_p90_p10_ratio} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: P90/P10 ratio (disposable income) + display: + name: P90/P10 ratio (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + p90_p50_ratio_disposable: + title: P90/P50 ratio (disposable income) - <> + unit: "" + short_unit: "" + description_short: The P90/P50 ratio measures the degree of inequality within the richest half of the population. A ratio of 2 means that someone just falling in the richest tenth of the population has twice the median income or consumption. + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_p90_p50_ratio} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: P90/P50 ratio (disposable income) + display: + name: P90/P50 ratio (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + palma_ratio_disposable: + title: Palma ratio (disposable income) - <> + unit: "" + short_unit: "" + description_short: The Palma ratio is a measure of inequality that divides the share received by the richest 10% by the share of the poorest 40%. Higher values indicate higher inequality. + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_palma_ratio} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: Palma ratio (disposable income) + display: + name: Palma ratio (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + s80_s20_ratio_disposable: + title: S80/S20 ratio (disposable income) - <> + unit: "" + short_unit: "" + description_short: The share of income of the richest 20% divided by the share of the poorest 20%. + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_s80_s20_ratio} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: S80/S20 ratio (disposable income) + display: + name: S80/S20 ratio (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + headcount_ratio_disposable_50_median: + title: 50% of median - Share of population in poverty (disposable income) - <> + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income per person below 50% of median. 
+ description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: 50% of median - Share of population in poverty (disposable income) + display: + name: 50% of median - Share of population in poverty (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + headcount_ratio_disposable_60_median: + title: 60% of median - Share of population in poverty (disposable income) - <> + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income per person below 60% of median. + description_key: + - "{definitions.disposable_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: 60% of median - Share of population in poverty (disposable income) + display: + name: 60% of median - Share of population in poverty (disposable income) + numDecimalPlaces: 1 + <<: *common-display + + headcount_ratio_market_50_median: + title: 50% of median - Share of population in poverty (market income) - <> + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income per person below 50% of median. + description_key: + - "{definitions.market_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_headcount_ratio_market} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: 50% of median - Share of population in poverty (market income) + display: + name: 50% of median - Share of population in poverty (market income) + numDecimalPlaces: 1 + <<: *common-display + + headcount_ratio_market_60_median: + title: 60% of median - Share of population in poverty (market income) - <> + unit: "%" + short_unit: "%" + description_short: Percentage of population living in households with an income per person below 60% of median. + description_key: + - "{definitions.market_income}" + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_headcount_ratio_market} + + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: 60% of median - Share of population in poverty (market income) + display: + name: 60% of median - Share of population in poverty (market income) + numDecimalPlaces: 1 + <<: *common-display + + gini_reduction: + title: Percentage reduction in Gini coefficient (before vs. after tax) - <> + unit: "%" + short_unit: "%" + description_short: "This is the percentage difference between the Gini coefficient before taxes and benefits and the Gini coefficient after taxes and benefits." 
+ description_key: + - "{definitions.equivalization}" + - "{definitions.additional_info}" + - "{definitions.covid}" + description_from_producer: |- + {definitions.source_income_definition_2012} + + {definitions.source_recommended_uses_and_limitations} + presentation: + title_public: Percentage reduction in Gini coefficient + display: + name: Percentage reduction in Gini coefficient + numDecimalPlaces: 1 + <<: *common-display + + diff --git a/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py new file mode 100644 index 00000000000..1fc4b9d7e34 --- /dev/null +++ b/etl/steps/data/garden/oecd/2024-04-10/income_distribution_database.py @@ -0,0 +1,211 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import List + +import owid.catalog.processing as pr +from owid.catalog import Table +from structlog import get_logger +from tabulate import tabulate + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Initialize logger. +log = get_logger() + +# Define new names for categories +INDICATOR_NAMES = { + "Gini (disposable income)": "gini_disposable", + "Gini (gross income)": "gini_gross", + "Gini (market income)": "gini_market", + "P50/P10 disposable income decile ratio": "p50_p10_ratio_disposable", + "P90/P10 disposable income decile ratio": "p90_p10_ratio_disposable", + "P90/P50 disposable income decile ratio": "p90_p50_ratio_disposable", + "Palma ratio (disposable income)": "palma_ratio_disposable", + "Quintile share ratio (disposable income)": "s80_s20_ratio_disposable", + "Poverty rate based on disposable income": "headcount_ratio_disposable", + "Poverty rate based on market income": "headcount_ratio_market", +} + +POVERTY_LINES = { + "Not applicable": "not_applicable", + "50% of the national\xa0median disposable income": "50_median", + "60% of the national\xa0median disposable income": "60_median", +} + +AGE_GROUPS = {"From 18 to 65 years": "Working population", "Over 65 years": "Over 65 years", "Total": "Total"} + +# Set table format when printing +TABLEFMT = "pretty" + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("income_distribution_database") + + # Read table from meadow dataset. + tb = ds_meadow["income_distribution_database"].reset_index() + + # + # Process data. + tb = rename_and_create_columns(tb) + + tb = create_relative_poverty_columns(tb) + + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + + sanity_checks(tb) + + tb = tb.format(["country", "year", "age"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def rename_and_create_columns(tb: Table) -> Table: + """ + Rename categories in measure, poverty_line and age columns and make the table wide. + Also, add a gini_reduction column. + """ + # Assert if all keys of dictionary are in the columns. 
+ assert set(INDICATOR_NAMES.keys()) == set(tb["measure"]), "Not all expected categories are in the measure column" + assert set(POVERTY_LINES.keys()) == set( + tb["poverty_line"] + ), "Not all expected categories are in the poverty_line column" + assert set(AGE_GROUPS.keys()) == set(tb["age"]), "Not all expected categories are in the age column" + + # Rename categories in measure, poverty_line and age columns. + tb["measure"] = tb["measure"].replace(INDICATOR_NAMES) + tb["poverty_line"] = tb["poverty_line"].replace(POVERTY_LINES) + tb["age"] = tb["age"].replace(AGE_GROUPS) + + # Make the table wide, using measure as columns. + tb = tb.pivot(index=["country", "year", "poverty_line", "age"], columns="measure", values="value").reset_index() + + # Create a variable that calculates the reduction from gini_market to gini_disposable + tb["gini_reduction"] = (tb["gini_market"] - tb["gini_disposable"]) / tb["gini_market"] * 100 + + return tb + + +def create_relative_poverty_columns(tb: Table) -> Table: + """ + Pivot table for headcount ratios and create multiple relative poverty columns with the poverty lines. + """ + + tb_inequality = tb.copy() + tb_poverty = tb.copy() + + # Filter poverty_line column + tb_inequality = tb_inequality[tb_inequality["poverty_line"] == "not_applicable"].reset_index(drop=True) + tb_poverty = tb_poverty[tb_poverty["poverty_line"] != "not_applicable"].reset_index(drop=True) + + # Define columns for both tables: tb_inequality has all the columns not containing headcount_ratio + # tb_poverty has all the columns containing headcount_ratio + inequality_columns = [c for c in tb_inequality.columns if "headcount_ratio" not in c] + poverty_columns = [c for c in tb_poverty.columns if "headcount_ratio" in c] + + tb_inequality = tb_inequality[inequality_columns] + tb_poverty = tb_poverty[["country", "year", "poverty_line", "age"] + poverty_columns] + + # Make tb_poverty wider + tb_poverty = tb_poverty.pivot( + index=["country", "year", "age"], columns="poverty_line", values=poverty_columns, join_column_levels_with="_" + ).reset_index(drop=True) + + # Remove poverty_line column in tb_inequality + tb_inequality = tb_inequality.drop(columns=["poverty_line"], errors="raise") + + # Merge both tables + tb = pr.merge(tb_inequality, tb_poverty, on=["country", "year", "age"]) + + return tb + + +def sanity_checks(tb: Table) -> None: + """Run several sanity checks on the table.""" + + # Define headcount ratio columns + headcount_ratio_columns = [c for c in tb.columns if "headcount_ratio" in c] + + # Divide headcount_ratio columns by 100 + tb[headcount_ratio_columns] = tb[headcount_ratio_columns] / 100 + + check_between_0_and_1( + tb, + [ + "gini_disposable", + "gini_gross", + "gini_market", + ] + + headcount_ratio_columns, + ) + + # Multiply headcount_ratio columns by 100 + tb[headcount_ratio_columns] = tb[headcount_ratio_columns] * 100 + + check_negative_values(tb) + + return None + + +def check_between_0_and_1(tb: Table, variables: List[str]) -> None: + """ + Check that indicators are between 0 and 1 + """ + + tb = tb.copy() + + for v in variables: + # Filter only values lower than 0 or higher than 1 + mask = (tb[v] > 1) | (tb[v] < 0) + tb_error = tb[mask].copy().reset_index() + + if not tb_error.empty: + log.fatal( + f"""Values for {v} are not between 0 and 1: + {tabulate(tb_error[['country', 'year', 'poverty_line', 'age', v]], headers = 'keys', tablefmt = TABLEFMT)}""" + ) + + return None + + +def check_negative_values(tb: Table) -> None: + """ + Check if there are negative values in the 
variables + """ + + tb = tb.copy() + + # Define variables: all in the table, except for country, year and age + variables = [c for c in tb.columns if c not in ["country", "year", "age"]] + + for v in variables: + # Create a mask to check if any value is negative + mask = tb[v] < 0 + tb_error = tb[mask].reset_index(drop=True).copy() + + if not tb_error.empty: + log.fatal( + f"""{len(tb_error)} observations for {v} are negative: + {tabulate(tb_error[['country', 'year', 'poverty_line', 'age', v]], headers = 'keys', tablefmt = TABLEFMT)}""" + ) + + return None diff --git a/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py b/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py index 6ef195fa15b..edd8db37ee3 100644 --- a/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py +++ b/etl/steps/data/garden/plastic_waste/2023-09-26/geyer_2017.py @@ -1,5 +1,6 @@ """Load a meadow dataset and create a garden dataset.""" import owid.catalog.processing as pr +import pandas as pd from etl.helpers import PathFinder, create_dataset @@ -27,8 +28,8 @@ def run(dest_dir: str) -> None: for year in range(2016, 2019): # 2019 is the stop value and is not included last_value = tb.loc[tb.index[-1], "plastic_production"] # Getting the last value in the 'Value' column new_value = last_value * (1 + growth_rate) # Calculating the value for the new year - new_row = {"country": "World", "year": year, "plastic_production": new_value} # Creating a new row - tb = tb.append(new_row, ignore_index=True) # Adding the new row to the DataFrame + new_row = pd.Series({"country": "World", "year": year, "plastic_production": new_value}) # Creating a new row + tb.loc[len(tb)] = new_row tb["plastic_production"] = tb["plastic_production"] * 1e6 # Convert to millions # Add data from OECD for 2019 diff --git a/etl/steps/data/garden/regions/2023-01-01/regions.py b/etl/steps/data/garden/regions/2023-01-01/regions.py index 1fb987a4b4a..086ea166032 100644 --- a/etl/steps/data/garden/regions/2023-01-01/regions.py +++ b/etl/steps/data/garden/regions/2023-01-01/regions.py @@ -53,6 +53,9 @@ def parse_raw_definitions(df: pd.DataFrame) -> pd.DataFrame: def run_sanity_checks(df: pd.DataFrame) -> None: + # Check that all regions have a name. + assert df[df["name"].isnull()].empty, f"Some regions do not have a name: {set(df[df['name'].isnull()]['code'])}" + # Check that there are no repeated codes. duplicated_codes = df[df["code"].duplicated()]["code"].tolist() assert len(duplicated_codes) == 0, f"Duplicated codes found: {duplicated_codes}" @@ -125,6 +128,9 @@ def run(dest_dir: str) -> None: lambda x: json.dumps(sum(list(x), [])) if pd.notna(x.values) else x ) + # Ensure "is_historical" is boolean. + tb_regions = tb_regions.astype({"is_historical": bool}) + # Set an appropriate index and sort conveniently. 
tb_regions = tb_regions.set_index("code", verify_integrity=True).sort_index() diff --git a/etl/steps/data/garden/regions/2023-01-01/regions.yml b/etl/steps/data/garden/regions/2023-01-01/regions.yml index 9c48b522abd..2239ada2813 100644 --- a/etl/steps/data/garden/regions/2023-01-01/regions.yml +++ b/etl/steps/data/garden/regions/2023-01-01/regions.yml @@ -1847,3 +1847,227 @@ end_year: 1902 successors: - "ZAF" + +# WHO regions +- code: WHO_AMR + name: "Americas (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "ATG" + - "ARG" + - "BHS" + - "BRB" + - "BOL" + - "BRA" + - "CAN" + - "CHL" + - "COL" + - "CRI" + - "CUB" + - "DMA" + - "DOM" + - "ECU" + - "SLV" + - "GRD" + - "GTM" + - "HTI" + - "HND" + - "JAM" + - "MEX" + - "NIC" + - "PAN" + - "PRY" + - "PER" + - "KNA" + - "LCA" + - "SUR" + - "TTO" + - "USA" + - "URY" + - "VEN" + +- code: WHO_AFR + name: "Africa (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "DZA" + - "AGO" + - "BEN" + - "BWA" + - "BFA" + - "BDI" + - "CMR" + - "CPV" + - "CAF" + - "TCD" + - "COM" + - "COG" + - "CIV" + - "COD" + - "GNQ" + - "ERI" + - "SWZ" + - "ETH" + - "GAB" + - "GMB" + - "GHA" + - "GIN" + - "GNB" + - "KEN" + - "LSO" + - "LBR" + - "MDG" + - "MWI" + - "MLI" + - "MRT" + - "MUS" + - "MOZ" + - "NAM" + - "NER" + - "NGA" + - "RWA" + - "STP" + - "SEN" + - "SYC" + - "SLE" + - "ZAF" + - "SSD" + - "TZA" + - "TGO" + - "UGA" + - "ZMB" + - "ZWE" +- code: WHO_EMR + name: "Eastern Mediterranean (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "AFG" + - "BHR" + - "DJI" + - "EGY" + - "IRN" + - "IRQ" + - "JOR" + - "KWT" + - "LBN" + - "LBY" + - "MAR" + - "OMN" + - "PAK" + - "QAT" + - "SAU" + - "SOM" + - "SDN" + - "SYR" + - "TUN" + - "ARE" + - "YEM" +- code: WHO_EUR + name: "Europe (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "ALB" + - "AND" + - "ARM" + - "AUT" + - "AZE" + - "BLR" + - "BEL" + - "BIH" + - "BGR" + - "HRV" + - "CYP" + - "CZE" + - "DNK" + - "EST" + - "FIN" + - "FRA" + - "GEO" + - "DEU" + - "GRC" + - "HUN" + - "ISL" + - "IRL" + - "ISR" + - "ITA" + - "KAZ" + - "KGZ" + - "LVA" + - "LTU" + - "LUX" + - "MLT" + - "MDA" + - "MCO" + - "MNE" + - "NLD" + - "MKD" + - "NOR" + - "POL" + - "PRT" + - "ROU" + - "RUS" + - "SMR" + - "SRB" + - "SVK" + - "SVN" + - "ESP" + - "SWE" + - "CHE" + - "TJK" + - "TUR" + - "TKM" + - "UKR" + - "GBR" + - "UZB" +- code: WHO_SEAR + name: "South-East Asia (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "BGD" + - "BTN" + - "PRK" + - "IND" + - "IDN" + - "MDV" + - "MMR" + - "NPL" + - "THA" + - "TLS" + - "LKA" +- code: WHO_WPAC + name: "Western Pacific (WHO)" + region_type: "aggregate" + defined_by: who + members: + - "AUS" + - "BRN" + - "KHM" + - "CHN" + - "COK" + - "FJI" + - "JPN" + - "KIR" + - "LAO" + - "MYS" + - "MHL" + - "FSM" + - "MNG" + - "NRU" + - "NZL" + - "NIU" + - "PLW" + - "PNG" + - "PHL" + - "WSM" + - "SGP" + - "SLB" + - "KOR" + - "TON" + - "TUV" + - "VUT" + - "VNM" diff --git a/etl/steps/data/garden/tourism/2023-05-05/unwto.py b/etl/steps/data/garden/tourism/2023-05-05/unwto.py index 54c1a2b5dbf..36208493c09 100644 --- a/etl/steps/data/garden/tourism/2023-05-05/unwto.py +++ b/etl/steps/data/garden/tourism/2023-05-05/unwto.py @@ -63,14 +63,14 @@ def run(dest_dir: str) -> None: merged_df_drop_ = merged_df.loc[~merged_df.country.isin(["Saba", "Sint Eustatius", "Bonaire"])] # Concatenate 'merged_df_drop_' and 'sum_bon_sint_saba' into a single DataFrame 'merged_df_concat'. # The rows of 'sum_bon_sint_saba' will be appended to 'merged_df_drop_'. 
- merged_df_concat = merged_df_drop_.append(sum_bon_sint_saba, ignore_index=True) + merged_df_concat = pd.concat([merged_df_drop_, sum_bon_sint_saba], ignore_index=True) # Set index, check that it's unique and reset index - assert not merged_df_concat[["country", "year"]].duplicated().any(), "Index is not well constructed" + assert not merged_df_concat[["country", "year"]].duplicated().any(), "Index is not well constructed" # type: ignore # Aggregate data by region (decided not to do for now) # Africa, Oceania, and income level categories - # regions_ = ["North America", + ## regions_ = ["North America", # "South America", # "Europe", # "Africa", diff --git a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py index 9a2ea51f26d..ae7a4784f82 100644 --- a/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py +++ b/etl/steps/data/garden/tuberculosis/2023-11-27/unhlm_commitments.py @@ -76,7 +76,7 @@ def add_meaning_to_codes(tb: Table) -> Table: "min_tra_collab", ] - tb[cols_0_1_3] = tb[cols_0_1_3].astype("category").replace({0: "No", 1: "Yes", 3: "Don't know"}) + tb[cols_0_1_3] = tb[cols_0_1_3].astype(object).replace({0: "No", 1: "Yes", 3: "Don't know"}).astype("category") tb[cols_other] = ( tb[cols_other] .astype("object") diff --git a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py index 104a3d08220..66c12efbfd0 100644 --- a/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py +++ b/etl/steps/data/garden/un/2024-01-17/urban_agglomerations_definition_count.py @@ -35,7 +35,9 @@ def run(dest_dir: str) -> None: # Add a 'year' column filled with 2018 df_counts["year"] = 2018 - df_counts["countries"] = df_counts["countries"].apply(lambda x: f"{x:,} inhabitants" if isinstance(x, int) else x) + df_counts["countries"] = ( + df_counts["countries"].astype(object).apply(lambda x: f"{x:,} inhabitants" if isinstance(x, int) else x) + ) # Replace '' values in the 'countries' column with 'No minimum population threshold' df_counts["countries"] = df_counts["countries"].astype(str).replace("", "No minimum population threshold") diff --git a/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml b/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml index d7c3699b9a3..919ca832bd0 100644 --- a/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml +++ b/etl/steps/data/garden/unep/2023-03-17/consumption_controlled_substances.meta.yml @@ -17,9 +17,6 @@ dataset: Negative values for a given year imply that quantities destroyed or quantities exported for the year exceeded the sum of production and imports, implying that the destroyed or exported quantities came from stockpiles. - licenses: - - name: # Example: Testing License Name - url: # Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml b/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml index 03e7ade1190..d4722538299 100644 --- a/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/dunnigan_martel_1987.meta.yml @@ -17,7 +17,6 @@ dataset: This dataset provides information on military and civilian deaths from wars, drawn from the book by Dunnigan and Martel (1987). 
licenses: - name: Doubleday (1987) - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml b/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml index 790bc299ceb..7f77a80f561 100644 --- a/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/eckhardt_1991.meta.yml @@ -17,7 +17,6 @@ dataset: This dataset provides information on military and civilian deaths from wars, drawn from the chapter by Eckhardt (1991). licenses: - name: World Priorities - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml b/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml index 2f4f3db8fed..3ca1fb6a42a 100644 --- a/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/kaye_1985.meta.yml @@ -17,7 +17,6 @@ dataset: This dataset provides information on direct and indirect military and civilian deaths from major armed conflicts, drawn from the report by Kaye et al. (1985). licenses: - name: Department of National Defence, Canada, Operational Research and Analysis Establishment, 1985 - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml b/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml index 6a12a6987e2..540549a7a8a 100644 --- a/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml +++ b/etl/steps/data/garden/war/2023-01-18/sutton_1971.meta.yml @@ -4,7 +4,6 @@ all_sources: published_by: Sutton, Antony. 1972. Wars and Revolutions in the Nineteenth Century. Hoover Institution Archives. url: https://searchworks.stanford.edu/view/3023823 date_accessed: 2023-01-09 - publication_date: # TO BE FILLED. Example: 2023-01-01 publication_year: 1971 # description: Source description. @@ -15,9 +14,6 @@ dataset: version: 2023-01-18 description: | This dataset provides information on deaths from wars and revolutions, using data from Sutton (1972). - licenses: - - name: Unknown - url: # TO BE FILLED. Example: https://url_of_testing_source.com/license sources: - *source-testing diff --git a/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py b/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py index 7f7c870a0aa..c5c662fd39c 100644 --- a/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py +++ b/etl/steps/data/garden/war/2024-01-09/nuclear_weapons_inventories.py @@ -29,7 +29,7 @@ def run(dest_dir: str) -> None: tb = tb.rename(columns=COLUMNS, errors="raise") # Looking at the original dashboards, it seems that missing values are shown as zeros. - tb = tb.fillna(0) + tb["number_of_warheads"] = tb["number_of_warheads"].fillna(0) # Harmonize country names. tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py b/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py index 671d20ecf9d..a5a5bcd91d0 100644 --- a/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py +++ b/etl/steps/data/garden/war/2024-01-09/status_of_world_nuclear_forces.py @@ -38,7 +38,8 @@ def run(dest_dir: str) -> None: # Looking at the original dashboard, it seems that missing values are shown as zeros. 
# https://public.tableau.com/app/profile/kate.kohn/viz/EstimatedGlobalNuclearWarheadInventories2021/Dashboard1 - tb = tb.fillna(0) + cols = [c for c in tb.columns if c not in ["country", "year"]] + tb[cols] = tb[cols].fillna(0) # Harmonize country names. tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) diff --git a/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py b/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py index 8941f0c9885..554eafe4d68 100644 --- a/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py +++ b/etl/steps/data/garden/war/2024-01-25/nuclear_weapons_tests.py @@ -35,7 +35,7 @@ def run(dest_dir: str) -> None: # Process data. # # By looking at the original table, it seems clear that empty cells mean zero. - tb = tb.fillna(0) + tb = tb.astype(object).fillna(0) # Temporarily convert all columns to string (to avoid issues with categorical variables). tb = tb.astype(str) diff --git a/etl/steps/data/garden/wash/2024-01-06/who.meta.yml b/etl/steps/data/garden/wash/2024-01-06/who.meta.yml index 8feb8a86f07..4e5167f58b9 100644 --- a/etl/steps/data/garden/wash/2024-01-06/who.meta.yml +++ b/etl/steps/data/garden/wash/2024-01-06/who.meta.yml @@ -25,7 +25,7 @@ definitions: safely_managed_sanitation_desc: &safely_managed_sanitation_desc | Safely managed sanitation services are defined as improved sanitation facilities that are not shared with other households and where excreta are safely disposed in situ or transported and treated off-site. basic_drinking_water_desc: &basic_drinking_water_desc | - Basic drinking water services are defined as an improved drinking water source,provided collection time is not more than 30 minutes for a roundtrip including queuing. + Basic drinking water services are defined as an improved drinking water source, provided collection time is not more than 30 minutes for a roundtrip including queuing. limited_drinking_water_desc: &limited_drinking_water_desc | Limited drinking water services are defined as drinking water from an improved source for which collection time exceeds 30 minutes for a roundtrip including queuing. 
improved_drinking_water_desc: &improved_drinking_water_desc | diff --git a/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb b/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb index 2c7a7bc580d..69a65f20899 100644 --- a/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb +++ b/etl/steps/data/garden/wb/2021-07-01/wb_income.ipynb @@ -86,10 +86,20 @@ "id": "e001fe46", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/plain": [ - "PosixPath('/Users/mojmir/projects/etl/data/meadow/wb/2021-07-01/wb_income')" + "\u001b[1;35mPosixPath\u001b[0m\u001b[1m(\u001b[0m\u001b[32m'/Users/mojmir/projects/etl2/data/meadow/wb/2021-07-01/wb_income'\u001b[0m\u001b[1m)\u001b[0m" ] }, "execution_count": 4, @@ -103,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "id": "134ea32a-77b4-4e4c-af5c-400f6edd5866", "metadata": {}, "outputs": [], @@ -114,17 +124,27 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "id": "24c738cd", "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/plain": [ - "['wb_income_group']" + "\u001b[1m[\u001b[0m\u001b[32m'wb_income_group'\u001b[0m\u001b[1m]\u001b[0m" ] }, - "execution_count": 15, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -135,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "id": "5553eb58-fd10-4a93-9356-859121b7bed0", "metadata": { "tags": [] @@ -148,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "id": "e9a67fe4-ca1e-4e73-b667-6cef8cc573b2", "metadata": {}, "outputs": [ @@ -162,7 +182,20 @@ { "data": { "text/html": [ - "\n", + "\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " wb_income_group\n", + " table\n", + " \n", "
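Editorial note: a recurring pattern in the hunks above is the move away from deprecated pandas idioms: DataFrame.append is replaced by pd.concat (or an explicit .loc assignment), and blanket fillna(0) calls are restricted to value columns, since filling a categorical column with a value outside its categories raises an error. A minimal, self-contained sketch of the pattern, with illustrative column names not taken from any step above:

import pandas as pd

tb = pd.DataFrame({"country": ["France"], "year": [2020], "value": [None]})

# DataFrame.append was removed in pandas 2.0; build a one-row frame and concatenate instead.
new_row = pd.DataFrame({"country": ["World"], "year": [2021], "value": [3.0]})
tb = pd.concat([tb, new_row], ignore_index=True)

# Restrict fillna to the value columns (or cast categoricals to object first), because
# Series.fillna on a categorical column raises when the fill value is not one of its categories.
value_cols = [c for c in tb.columns if c not in ["country", "year"]]
tb[value_cols] = tb[value_cols].fillna(0)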