From b8564d38435e67dba14779f5bf8c83cbf0a4aa2d Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Fri, 22 Nov 2024 13:24:49 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=8A=20emissions:=20Update=20emissions?= =?UTF-8?q?=20and=20energy=20datasets=20(#3582)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 📊 emissions: Update emissions and energy datasets * Refactor emissions steps to create new release of CO2 dataset * Add snapshot links to GCB data * Archive unused steps * Improve owid_co2 * Write data and codebook to github repos as well * Create export steps to write to S3 * Create codebook in garden, and improve github and s3 steps * Fix order of rows in codebook * Add sanity checks * Minor improvements in README * Update Global Carbon Budget data * Improve format * Fix small bug in anomalist * Archive unused steps * Add commented code for sanity checks * Fix wrong link * Warn that S3 export step does not work, but for now do nothing --- apps/anomalist/anomalist_api.py | 6 +- dag/archive/emissions.yml | 69 + dag/emissions.yml | 79 +- .../garden/emissions/2024-11-13/owid_co2.py | 490 +++++++ .../gdp_and_co2_decoupling.meta.yml | 3 + .../2024-11-21/gdp_and_co2_decoupling.py | 158 +++ .../garden/emissions/2024-11-21/owid_co2.py | 490 +++++++ .../global_carbon_budget.countries.json | 283 ++++ ...obal_carbon_budget.excluded_countries.json | 7 + .../2024-11-21/global_carbon_budget.meta.yml | 514 ++++++++ .../gcp/2024-11-21/global_carbon_budget.py | 1142 +++++++++++++++++ .../gcp/2024-11-21/global_carbon_budget.py | 44 + .../gcp/2024-11-21/global_carbon_budget.py | 212 +++ .../export/github/co2_data/latest/owid_co2.py | 546 +++----- .../export/s3/co2_data/latest/owid_co2.py | 132 ++ .../gcp/2024-11-13/global_carbon_budget.py | 11 +- ...carbon_budget_fossil_co2_emissions.csv.dvc | 5 +- ...al_carbon_budget_global_emissions.xlsx.dvc | 5 +- ..._budget_land_use_change_emissions.xlsx.dvc | 5 +- 
..._carbon_budget_national_emissions.xlsx.dvc | 5 +- .../gcp/2024-11-21/global_carbon_budget.py | 71 + ...carbon_budget_fossil_co2_emissions.csv.dvc | 28 + ...al_carbon_budget_global_emissions.xlsx.dvc | 28 + ..._budget_land_use_change_emissions.xlsx.dvc | 28 + ..._carbon_budget_national_emissions.xlsx.dvc | 28 + 25 files changed, 3985 insertions(+), 404 deletions(-) create mode 100644 etl/steps/data/garden/emissions/2024-11-13/owid_co2.py create mode 100644 etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml create mode 100644 etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py create mode 100644 etl/steps/data/garden/emissions/2024-11-21/owid_co2.py create mode 100644 etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json create mode 100644 etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json create mode 100644 etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml create mode 100644 etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py create mode 100644 etl/steps/data/grapher/gcp/2024-11-21/global_carbon_budget.py create mode 100644 etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py create mode 100644 etl/steps/export/s3/co2_data/latest/owid_co2.py create mode 100644 snapshots/gcp/2024-11-21/global_carbon_budget.py create mode 100644 snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc create mode 100644 snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc create mode 100644 snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc create mode 100644 snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc diff --git a/apps/anomalist/anomalist_api.py b/apps/anomalist/anomalist_api.py index e640f804c21..765347434bc 100644 --- a/apps/anomalist/anomalist_api.py +++ b/apps/anomalist/anomalist_api.py @@ -121,14 +121,14 @@ def renormalize_score( # Function to format 
population numbers. def pretty_print_number(number): - if number >= 1e9: + if pd.isna(number): + return "?" + elif int(number) >= 1e9: return f"{number/1e9:.1f}B" elif number >= 1e6: return f"{number/1e6:.1f}M" elif number >= 1e3: return f"{number/1e3:.1f}k" - elif pd.isna(number): - return "?" else: return f"{int(number)}" diff --git a/dag/archive/emissions.yml b/dag/archive/emissions.yml index 5f987f6aab9..1056ede4622 100644 --- a/dag/archive/emissions.yml +++ b/dag/archive/emissions.yml @@ -48,3 +48,72 @@ steps: data://garden/emissions/2024-02-26/gdp_and_co2_decoupling: - data://garden/worldbank_wdi/2024-05-20/wdi - data://garden/gcp/2023-12-12/global_carbon_budget + # + # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2023-12-12 to use the latest primary energy data). + # + data://meadow/gcp/2023-12-12/global_carbon_budget: + - snapshot://gcp/2023-12-12/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2023-12-12/global_carbon_budget_national_emissions.xlsx + - snapshot://gcp/2023-12-12/global_carbon_budget_land_use_change_emissions.xlsx + - snapshot://gcp/2023-12-12/global_carbon_budget_global_emissions.xlsx + # + # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2024-06-20 to use the latest primary energy data). + # + data://garden/gcp/2024-06-20/global_carbon_budget: + - data://garden/wb/2024-03-11/income_groups + - data://garden/demography/2023-03-31/population + - data://garden/energy/2024-06-20/primary_energy_consumption + - data://garden/regions/2023-01-01/regions + - data://meadow/gcp/2023-12-12/global_carbon_budget + - data://garden/ggdc/2024-04-26/maddison_project_database + # + # Decoupling of GDP and CO2 (2023). + # + data://garden/emissions/2024-06-20/gdp_and_co2_decoupling: + - data://garden/gcp/2024-06-20/global_carbon_budget + - data://garden/worldbank_wdi/2024-05-20/wdi + # + # GCP - Global Carbon Budget. 
+ # + data://grapher/gcp/2024-06-20/global_carbon_budget: + - data://garden/gcp/2024-06-20/global_carbon_budget + # + # GCP - Global Carbon Budget. + # + data://meadow/gcp/2024-11-13/global_carbon_budget: + - snapshot://gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx + - snapshot://gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx + - snapshot://gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx + # + # GCP - Global Carbon Budget. + # + data://garden/gcp/2024-11-13/global_carbon_budget: + - data://garden/ggdc/2024-04-26/maddison_project_database + - data://garden/wb/2024-07-29/income_groups + - data://garden/demography/2024-07-15/population + - data://meadow/gcp/2024-11-13/global_carbon_budget + - data://garden/regions/2023-01-01/regions + - data://garden/energy/2024-06-20/primary_energy_consumption + # + # Emissions - CO2 dataset. + # + data://garden/emissions/2024-11-13/owid_co2: + - data://garden/ggdc/2024-04-26/maddison_project_database + - data://garden/demography/2023-03-31/population + - data://garden/regions/2023-01-01/regions + - data://garden/gcp/2024-11-13/global_carbon_budget + - data://garden/emissions/2024-04-08/national_contributions + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/energy/2024-06-20/primary_energy_consumption + # + # Decoupling of GDP and CO2 (2023). + # + data://garden/emissions/2024-11-13/gdp_and_co2_decoupling: + - data://garden/gcp/2024-11-13/global_carbon_budget + - data://garden/worldbank_wdi/2024-05-20/wdi + # + # GCP - Global Carbon Budget. 
+ # + data://grapher/gcp/2024-11-13/global_carbon_budget: + - data://garden/gcp/2024-11-13/global_carbon_budget diff --git a/dag/emissions.yml b/dag/emissions.yml index 6c455380f49..1ff0fdea127 100644 --- a/dag/emissions.yml +++ b/dag/emissions.yml @@ -24,14 +24,6 @@ steps: data://grapher/andrew/2019-12-03/co2_mitigation_curves_2celsius: - data://garden/andrew/2019-12-03/co2_mitigation_curves # - # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2023-12-12 to use the latest primary energy data). - # - data://meadow/gcp/2023-12-12/global_carbon_budget: - - snapshot://gcp/2023-12-12/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2023-12-12/global_carbon_budget_global_emissions.xlsx - - snapshot://gcp/2023-12-12/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2023-12-12/global_carbon_budget_land_use_change_emissions.xlsx - # # RFF - World Carbon Pricing (2022-09-14). # data://meadow/rff/2023-10-19/world_carbon_pricing: @@ -93,63 +85,52 @@ steps: data://grapher/emissions/2024-04-08/national_contributions: - data://garden/emissions/2024-04-08/national_contributions # - # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2024-06-20 to use the latest primary energy data). + # GCP - Global Carbon Budget. 
# - data://garden/gcp/2024-06-20/global_carbon_budget: - - data://garden/ggdc/2024-04-26/maddison_project_database - - data://meadow/gcp/2023-12-12/global_carbon_budget - - data://garden/demography/2023-03-31/population - - data://garden/regions/2023-01-01/regions - - data://garden/wb/2024-03-11/income_groups - - data://garden/energy/2024-06-20/primary_energy_consumption + data://meadow/gcp/2024-11-21/global_carbon_budget: + - snapshot://gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv + - snapshot://gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx + - snapshot://gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx + - snapshot://gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx # # GCP - Global Carbon Budget. # - data://grapher/gcp/2024-06-20/global_carbon_budget: - - data://garden/gcp/2024-06-20/global_carbon_budget + data://garden/gcp/2024-11-21/global_carbon_budget: + - data://garden/demography/2024-07-15/population + - data://meadow/gcp/2024-11-21/global_carbon_budget + - data://garden/regions/2023-01-01/regions + - data://garden/wb/2024-07-29/income_groups + - data://garden/energy/2024-06-20/primary_energy_consumption + - data://garden/ggdc/2024-04-26/maddison_project_database # # Decoupling of GDP and CO2 (2023). # - data://garden/emissions/2024-06-20/gdp_and_co2_decoupling: + data://garden/emissions/2024-11-21/gdp_and_co2_decoupling: - data://garden/worldbank_wdi/2024-05-20/wdi - - data://garden/gcp/2024-06-20/global_carbon_budget + - data://garden/gcp/2024-11-21/global_carbon_budget # - # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2023-12-12 to use the latest primary energy data). + # GCP - Global Carbon Budget. 
# - data://meadow/gcp/2024-11-13/global_carbon_budget: - - snapshot://gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx - - snapshot://gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx - - snapshot://gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv - - snapshot://gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx + data://grapher/gcp/2024-11-21/global_carbon_budget: + - data://garden/gcp/2024-11-21/global_carbon_budget # - # GCP - Global Carbon Budget (published on 2023-12-05, updated on 2024-06-20 to use the latest primary energy data). + # Emissions - CO2 dataset. # - data://garden/gcp/2024-11-13/global_carbon_budget: - - data://garden/wb/2024-07-29/income_groups - - data://garden/regions/2023-01-01/regions + data://garden/emissions/2024-11-21/owid_co2: - data://garden/demography/2024-07-15/population - - data://garden/ggdc/2024-04-26/maddison_project_database - - data://meadow/gcp/2024-11-13/global_carbon_budget + - data://garden/emissions/2024-04-08/national_contributions + - data://garden/gcp/2024-11-21/global_carbon_budget + - data://garden/regions/2023-01-01/regions - data://garden/energy/2024-06-20/primary_energy_consumption + - data://garden/climate_watch/2023-10-31/emissions_by_sector + - data://garden/ggdc/2024-04-26/maddison_project_database # - # GCP - Global Carbon Budget. - # - data://grapher/gcp/2024-11-13/global_carbon_budget: - - data://garden/gcp/2024-11-13/global_carbon_budget - # - # Decoupling of GDP and CO2 (2023). + # Emissions - CO2 dataset. # - data://garden/emissions/2024-11-13/gdp_and_co2_decoupling: - - data://garden/worldbank_wdi/2024-05-20/wdi - - data://garden/gcp/2024-11-13/global_carbon_budget + export://github/co2_data/latest/owid_co2: + - data://garden/emissions/2024-11-21/owid_co2 # # Emissions - CO2 dataset. 
# - export://github/co2_data/latest/owid_co2: - - data://garden/ggdc/2024-04-26/maddison_project_database - - data://garden/demography/2023-03-31/population - - data://garden/regions/2023-01-01/regions - - data://garden/emissions/2024-04-08/national_contributions - - data://garden/gcp/2024-11-13/global_carbon_budget - - data://garden/climate_watch/2023-10-31/emissions_by_sector - - data://garden/energy/2024-06-20/primary_energy_consumption + export://s3/co2_data/latest/owid_co2: + - data://garden/emissions/2024-11-21/owid_co2 diff --git a/etl/steps/data/garden/emissions/2024-11-13/owid_co2.py b/etl/steps/data/garden/emissions/2024-11-13/owid_co2.py new file mode 100644 index 00000000000..c93ad47b92b --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-13/owid_co2.py @@ -0,0 +1,490 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +Datasets combined: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on +GDP are included. + +""" + +import re + +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Origin, Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Conversion factor from tonnes to million tonnes. +TONNES_TO_MILLION_TONNES = 1e-6 + +# Select columns to use from each dataset, and how to rename them. 
+GCP_COLUMNS = { + "country": "country", + "year": "year", + "emissions_total": "co2", + "emissions_total_per_capita": "co2_per_capita", + "traded_emissions": "trade_co2", + "emissions_from_cement": "cement_co2", + "emissions_from_cement_per_capita": "cement_co2_per_capita", + "emissions_from_coal": "coal_co2", + "emissions_from_coal_per_capita": "coal_co2_per_capita", + "emissions_from_flaring": "flaring_co2", + "emissions_from_flaring_per_capita": "flaring_co2_per_capita", + "emissions_from_gas": "gas_co2", + "emissions_from_gas_per_capita": "gas_co2_per_capita", + "emissions_from_oil": "oil_co2", + "emissions_from_oil_per_capita": "oil_co2_per_capita", + "emissions_from_other_industry": "other_industry_co2", + "emissions_from_other_industry_per_capita": "other_co2_per_capita", + "pct_growth_emissions_total": "co2_growth_prct", + "growth_emissions_total": "co2_growth_abs", + "emissions_total_per_gdp": "co2_per_gdp", + "emissions_total_per_unit_energy": "co2_per_unit_energy", + "consumption_emissions": "consumption_co2", + "consumption_emissions_per_capita": "consumption_co2_per_capita", + "consumption_emissions_per_gdp": "consumption_co2_per_gdp", + "cumulative_emissions_total": "cumulative_co2", + "cumulative_emissions_from_cement": "cumulative_cement_co2", + "cumulative_emissions_from_coal": "cumulative_coal_co2", + "cumulative_emissions_from_flaring": "cumulative_flaring_co2", + "cumulative_emissions_from_gas": "cumulative_gas_co2", + "cumulative_emissions_from_oil": "cumulative_oil_co2", + "cumulative_emissions_from_other_industry": "cumulative_other_co2", + "pct_traded_emissions": "trade_co2_share", + "emissions_total_as_share_of_global": "share_global_co2", + "emissions_from_cement_as_share_of_global": "share_global_cement_co2", + "emissions_from_coal_as_share_of_global": "share_global_coal_co2", + "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", + "emissions_from_gas_as_share_of_global": "share_global_gas_co2", + 
"emissions_from_oil_as_share_of_global": "share_global_oil_co2", + "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", + "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", + "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", + "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", + "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", + "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", + "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", + "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", + # New variables, related to land-use change emissions. + "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", + "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", + "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", + "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", + "emissions_from_land_use_change": "land_use_change_co2", + "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", + "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", + "emissions_total_including_land_use_change": "co2_including_luc", + "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", + "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", + "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", + "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", + "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", + 
"pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", +} +JONES_COLUMNS = { + "country": "country", + "year": "year", + "temperature_response_co2_total": "temperature_change_from_co2", + "temperature_response_ghg_total": "temperature_change_from_ghg", + "temperature_response_ch4_total": "temperature_change_from_ch4", + "temperature_response_n2o_total": "temperature_change_from_n2o", + "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", +} +CLIMATE_WATCH_GHG_COLUMNS = { + "country": "country", + "year": "year", + "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", + "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", + "total_ghg_emissions_including_lucf": "total_ghg", + "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", +} +CLIMATE_WATCH_CH4_COLUMNS = { + "country": "country", + "year": "year", + "total_ch4_emissions_including_lucf": "methane", + "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", +} +CLIMATE_WATCH_N2O_COLUMNS = { + "country": "country", + "year": "year", + "total_n2o_emissions_including_lucf": "nitrous_oxide", + "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", +} +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", + "primary_energy_consumption_per_capita__kwh": "energy_per_capita", + "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", +} +REGIONS_COLUMNS = { + "name": "country", + "iso_alpha3": "iso_code", +} +POPULATION_COLUMNS = { + "country": "country", + "year": "year", + "population": "population", +} +GDP_COLUMNS = { + "country": "country", + "year": "year", + "gdp": "gdp", +} + +UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}} + + +def convert_units(table: Table) -> Table: + """Convert units of table. 
+ + Parameters + ---------- + table : Table + Data with its original units. + + Returns + ------- + Table + Data after converting units of specific columns. + + """ + table = table.copy() + # Check units and convert to more convenient ones. + for column in table.columns: + unit = table[column].metadata.unit + title = table[column].metadata.title + description_short = table[column].metadata.description or table[column].metadata.description_short + if unit in list(UNITS): + table[column] *= UNITS[unit]["conversion"] + table[column].metadata.unit = UNITS[unit]["new_unit"] + table[column].metadata.short_unit = UNITS[unit]["new_short_unit"] + table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"]) + table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"]) + + return table + + +def combine_tables( + tb_gcp: Table, + tb_jones: Table, + tb_climate_watch_ghg: Table, + tb_climate_watch_ch4: Table, + tb_climate_watch_n2o: Table, + tb_energy: Table, + tb_gdp: Table, + tb_population: Table, + tb_regions: Table, +) -> Table: + """Combine tables. + + Parameters + ---------- + tb_gcp : Table + Global Carbon Budget table (from Global Carbon Project). + tb_jones : Table + National contributions to climate change (from Jones et al. (2023)). + tb_climate_watch_ghg : Table + Greenhouse gas emissions table (from Climate Watch). + tb_climate_watch_ch4 : Table + CH4 emissions table (from Climate Watch). + tb_climate_watch_n2o : Table + N2O emissions table (from Climate Watch). + tb_energy : Table + Primary energy consumption table (from EI & EIA). + tb_gdp : Table + Maddison GDP table (from GGDC). + tb_population : Table + OWID population table (from various sources). + tb_regions : Table + OWID regions table. + + Returns + ------- + combined : Table + Combined table with metadata and variables metadata. + + """ + # Combine main tables (with an outer join, to gather all entities from all tables).
+ combined = tb_gcp.copy() + for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]: + combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) + + # Add secondary tables (with a left join, to keep only entities for which we have emissions data). + for table in [tb_energy, tb_gdp, tb_population]: + combined = combined.merge(table, on=["country", "year"], how="left") + + # Countries-regions dataset does not have a year column, so it has to be merged on country. + combined = combined.merge(tb_regions, on="country", how="left") + + # Check that no column name was repeated across tables (a repeated column would get an "_x" suffix when merging). + error = "Repeated columns in combined data." + assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error + + # Adjust units. + combined = convert_units(combined) + + return combined + + +def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table: + """Clean and prepare output table. + + Parameters + ---------- + combined : Table + Combined table. + ds_regions : Dataset + Regions dataset, only used to get its version. + + Returns + ------- + combined: Table + Cleaned combined table. + + """ + # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). + columns_that_must_have_data = [ + column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] + ] + combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) + + # Add metadata to the ISO column (loaded from the regions dataset). + combined["iso_code"].m.origins = [ + Origin( + producer="International Organization for Standardization", + title="Regions", + date_published=ds_regions.version, + ) + ] + combined["iso_code"].metadata.title = "ISO code" + combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes."
+ combined["iso_code"].metadata.unit = "" + + # Sanity check. + columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] + assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" + + # Sort rows and columns conveniently. + first_columns = ["country", "year", "iso_code", "population", "gdp"] + combined = combined[first_columns + [column for column in sorted(combined.columns) if column not in first_columns]] + + # Improve table format. + combined = combined.format() + + return combined + + +def remove_details_on_demand(text: str) -> str: + # Remove references to details on demand from a text. + # Example: "This is a [description](#dod:something)." -> "This is a description." + regex = r"\(\#dod\:.*\)" + if "(#dod:" in text: + text = re.sub(regex, "", text).replace("[", "").replace("]", "") + + return text + + +def prepare_codebook(tb: Table) -> pd.DataFrame: + table = tb.reset_index() + + # Manually create an origin for the regions dataset. + regions_origin = [Origin(producer="Our World in Data", title="Regions", date_published=str(table["year"].max()))] + + # Manually edit some of the metadata fields. + table["country"].metadata.title = "Country" + table["country"].metadata.description_short = "Geographic location." + table["country"].metadata.description = None + table["country"].metadata.unit = "" + table["country"].metadata.origins = regions_origin + table["year"].metadata.title = "Year" + table["year"].metadata.description_short = "Year of observation." + table["year"].metadata.description = None + table["year"].metadata.unit = "" + table["year"].metadata.origins = regions_origin + + #################################################################################################################### + if table["population"].metadata.description is None: + print("WARNING: Column population has no longer a description field. 
Remove this part of the code") + else: + table["population"].metadata.description = None + + #################################################################################################################### + + # Gather column names, titles, short descriptions, unit and origins from the indicators' metadata. + metadata = {"column": [], "description": [], "unit": [], "source": []} + for column in table.columns: + metadata["column"].append(column) + + if hasattr(table[column].metadata, "description") and table[column].metadata.description is not None: + print(f"WARNING: Column {column} still has a 'description' field.") + # Prepare indicator's description. + description = "" + if ( + hasattr(table[column].metadata.presentation, "title_public") + and table[column].metadata.presentation.title_public is not None + ): + description += table[column].metadata.presentation.title_public + else: + description += table[column].metadata.title + if table[column].metadata.description_short: + description += f" - {table[column].metadata.description_short}" + description = remove_details_on_demand(description) + metadata["description"].append(description) + + # Prepare indicator's unit. + if table[column].metadata.unit is None: + print(f"WARNING: Column {column} does not have a unit.") + unit = "" + else: + unit = table[column].metadata.unit + metadata["unit"].append(unit) + + # Gather unique origins of current variable. + unique_sources = [] + for origin in table[column].metadata.origins: + # Construct the source name from the origin's attribution. + # If not defined, build it using the default format "Producer - Data product (year)". + source_name = ( + origin.attribution + or f"{origin.producer} - {origin.title or origin.title_snapshot} ({origin.date_published.split('-')[0]})" + ) + + # Add url at the end of the source. + if origin.url_main: + source_name += f" [{origin.url_main}]" + + # Add the source to the list of unique sources. 
+ if source_name not in unique_sources: + unique_sources.append(source_name) + + # Concatenate all sources. + sources_combined = "; ".join(unique_sources) + metadata["source"].append(sources_combined) + + # Create a dataframe with the gathered metadata and sort conveniently by column name. + codebook = pd.DataFrame(metadata).set_index("column").sort_index() + # For clarity, ensure column descriptions are in the same order as the columns in the data. + first_columns = ["country", "year", "iso_code", "population", "gdp"] + codebook = pd.concat([codebook.loc[first_columns], codebook.drop(first_columns, errors="raise")]).reset_index() + # Create a table with the appropriate metadata. + codebook = Table(codebook).format( + keys=["column"], sort_rows=False, sort_columns=False, short_name="owid_co2_codebook" + ) + codebook_origin = [ + Origin(producer="Our World in Data", title="CO2-data codebook", date_published=str(table["year"].max())) + ] + for column in ["description", "unit", "source"]: + codebook[column].metadata.origins = codebook_origin + + return codebook + + +def sanity_check_outputs(tb: Table, tb_codebook: Table) -> None: + error = "Dataset columns should coincide with the codebook 'columns'." + assert set(tb_codebook.reset_index()["column"]) == set(tb.reset_index().columns), error + + error = "All rows in dataset should contain at least one non-NaN value." + assert not tb.isnull().all(axis=1).any(), error + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the global carbon budget dataset from the Global Carbon Project (GCP). + ds_gcp = paths.load_dataset("global_carbon_budget") + + # Load the Jones et al. (2023) dataset on national contributions to climate change. + ds_jones = paths.load_dataset("national_contributions") + + # Load the greenhouse gas emissions by sector dataset by Climate Watch. + ds_climate_watch = paths.load_dataset("emissions_by_sector") + + # Load the GDP dataset by GGDC Maddison. 
+ ds_gdp = paths.load_dataset("maddison_project_database") + + # Load primary energy consumption dataset (by different sources in our 'energy' namespace). + ds_energy = paths.load_dataset("primary_energy_consumption") + + # Load population dataset. + ds_population = paths.load_dataset("population") + + # Load countries-regions dataset (required to get ISO codes). + ds_regions = paths.load_dataset("regions") + + # Gather all required tables from all datasets. + tb_gcp = ds_gcp["global_carbon_budget"] + tb_jones = ds_jones["national_contributions"] + tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"] + tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"] + tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"] + tb_energy = ds_energy["primary_energy_consumption"] + tb_gdp = ds_gdp["maddison_project_database"] + tb_population = ds_population["population"] + tb_regions = ds_regions["regions"] + + # + # Process data. + # + # Choose required columns and rename them. 
+ tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") + tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") + tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename( + columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise" + ) + tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename( + columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise" + ) + tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename( + columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise" + ) + tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( + columns=PRIMARY_ENERGY_COLUMNS, errors="raise" + ) + tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") + tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( + columns=POPULATION_COLUMNS, errors="raise" + ) + tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") + + # Combine tables. + combined = combine_tables( + tb_gcp=tb_gcp, + tb_jones=tb_jones, + tb_climate_watch_ghg=tb_climate_watch_ghg, + tb_climate_watch_ch4=tb_climate_watch_ch4, + tb_climate_watch_n2o=tb_climate_watch_n2o, + tb_energy=tb_energy, + tb_gdp=tb_gdp, + tb_population=tb_population, + tb_regions=tb_regions, + ) + + # Prepare output data table. + tb = prepare_outputs(combined=combined, ds_regions=ds_regions) + + # Prepare codebook. + tb_codebook = prepare_codebook(tb=tb) + + # Sanity check. + sanity_check_outputs(tb=tb, tb_codebook=tb_codebook) + + # + # Save outputs. + # + # Create a new garden dataset containing the data table and its codebook.
+ ds_grapher = create_dataset(dest_dir, tables=[tb, tb_codebook], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml new file mode 100644 index 00000000000..7d8a0eb20e4 --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.meta.yml @@ -0,0 +1,3 @@ +dataset: + title: Decoupling of GDP and CO2 emissions + update_period_days: 365 diff --git a/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py new file mode 100644 index 00000000000..c0de7bc239f --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-21/gdp_and_co2_decoupling.py @@ -0,0 +1,158 @@ +"""This step takes the Global Carbon Budget and GDP data from World Bank's World Development Indicators, and creates a +dataset with the changes in emissions and GDP over time. + +We already have an interactive chart showing similar data, +for per capita GDP and per capita, consumption-based CO2 emissions: +https://ourworldindata.org/grapher/co2-emissions-and-gdp + +The data in the current step is not used by any grapher step, but it is used in the following static chart: +https://drive.google.com/file/d/1PflfQpr4mceVWRSGEqMP6Gbo1tFQZzOp/view?usp=sharing + +""" + +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# First and final years whose (per capita) GDP and emissions will be compared. +START_YEAR = 2006 +END_YEAR = 2021 + +# Columns to select from WDI, and how to rename them.
COLUMNS_WDI = {
    "country": "country",
    "year": "year",
    # GDP, PPP (constant 2017 international $)
    # "ny_gdp_mktp_pp_kd": "gdp",
    # GDP per capita, PPP (constant 2017 international $)
    "ny_gdp_pcap_pp_kd": "gdp_per_capita",
}

# Columns to select from GCB, and how to rename them.
COLUMNS_GCB = {
    "country": "country",
    "year": "year",
    # "emissions_total": "production_emissions",
    # "emissions_total_per_capita": "production_emissions_per_capita",
    # "consumption_emissions": "consumption_emissions",
    "consumption_emissions_per_capita": "consumption_emissions_per_capita",
    # 'emissions_total_including_land_use_change': "",
    # 'emissions_total_including_land_use_change_per_capita': "",
}


def run(dest_dir: str) -> None:
    """Create a garden dataset with the percent change of each indicator between START_YEAR and END_YEAR.

    The indicators are the columns selected by COLUMNS_GCB and COLUMNS_WDI. The output table is indexed by country,
    has one "<indicator>_change" column per indicator, and only contains countries for which all changes are defined.

    Parameters
    ----------
    dest_dir : str
        Destination folder of the dataset, passed on to create_dataset.

    """
    #
    # Load inputs.
    #
    # Read the main table of the Global Carbon Budget dataset.
    tb_gcb = paths.load_dataset("global_carbon_budget")["global_carbon_budget"].reset_index()

    # Read the main table of the WDI dataset.
    tb_wdi = paths.load_dataset("wdi")["wdi"].reset_index()

    #
    # Process data.
    #
    # Keep only the required columns of each table, with their new names.
    tb_gcb = tb_gcb[list(COLUMNS_GCB)].rename(columns=COLUMNS_GCB, errors="raise")
    tb_wdi = tb_wdi[list(COLUMNS_WDI)].rename(columns=COLUMNS_WDI, errors="raise")

    # Gather all country-years of both tables in a single one.
    tb = tb_gcb.merge(tb_wdi, on=["country", "year"], how="outer", short_name=paths.short_name)

    # Indicators are all columns other than the index ones.
    index_columns = ["country", "year"]
    indicator_columns = [column for column in tb.columns if column not in index_columns]

    # Discard country-years without any data.
    tb = tb.dropna(subset=indicator_columns, how="all").reset_index(drop=True)

    # Select data for all countries at the initial year, and at the final year.
    tb_initial = tb[(tb["year"] == START_YEAR)].reset_index(drop=True)
    tb_final = tb[tb["year"] == END_YEAR].reset_index(drop=True)

    # Place initial and final values side by side, one row per country with data on the initial year.
    tb = tb_initial.merge(tb_final, on="country", how="left", suffixes=("_start_year", "_final_year"))

    # Calculate the percent change of each indicator.
    for indicator in indicator_columns:
        initial_value = tb[f"{indicator}_start_year"]
        final_value = tb[f"{indicator}_final_year"]
        tb[f"{indicator}_change"] = (final_value - initial_value) / initial_value * 100

    # Keep only the country and the changes (all other columns contain "year" in their name).
    year_columns = [column for column in tb.columns if "year" in column]
    tb = tb.drop(columns=year_columns)

    # Keep only countries for which all changes could be calculated.
    tb = tb.dropna(how="any").reset_index(drop=True)

    # Index by country (which must be unique) and sort.
    tb = tb.set_index(["country"], verify_integrity=True).sort_index()

    #
    # Save outputs.
    #
    # Create a new garden dataset, stored as csv.
    dataset = create_dataset(dest_dir=dest_dir, tables=[tb], check_variables_metadata=True, formats=["csv"])
    dataset.save()


# To quickly inspect the decoupling of GDP per capita vs consumption-based emissions per capita, use this function.
+# def plot_decoupling(tb, countries=None): +# import plotly.express as px +# import owid.catalog.processing as pr +# from tqdm.auto import tqdm + +# column = "gdp_per_capita_change" +# emissions_column = "consumption_emissions_per_capita_change" +# _tb = tb.reset_index().astype({"country": str})[["country", column, emissions_column]] +# _tb["year"] = START_YEAR +# if countries is None: +# countries = sorted(set(_tb["country"])) +# for country in tqdm(countries): +# tb_old = _tb[_tb["country"] == country].reset_index(drop=True) +# if (tb_old[emissions_column].isna().all()) or (tb_old[column].isna().all()): +# continue +# title = tb_old[column].metadata.title or column +# tb_new = tb_old.copy() +# tb_new["year"] = END_YEAR +# tb_old[column] = 0 +# tb_old[emissions_column] = 0 +# tb_plot = pr.concat([tb_old, tb_new], ignore_index=True) +# tb_plot = tb_plot.melt(id_vars=["country", "year"], var_name="Indicator") +# plot = px.line(tb_plot, x="year", y="value", color="Indicator", title=f"{country} - {title}") +# plot.show() + +# List of countries currently considered for the static chart: +# countries = ["Ireland", "Finland", "Sweden", "Denmark", "Netherlands", "Estonia", "United States", "Canada", "Germany", +# "Belgium", "New Zealand", "Israel", "Japan", "Singapore", "Dominican Republic", "Hungary", "Australia", "Zimbabwe", +# "Ukraine", "Bulgaria", "Switzerland", "Hong Kong", "Slovakia", "Romania", "Czechia", "Nicaragua", "Nigeria", +# "Azerbaijan", "Slovenia", "Croatia"] +# Check that the chosen countries still fulfil the expected conditions. 
+# print("Countries in the list where GDP has increased less than 5% or emissions have decreased less than 5%:") +# for c in countries: +# if not tb.loc[c]["consumption_emissions_per_capita_change"] < -5: +# print("emissions", c, tb.loc[c]["consumption_emissions_per_capita_change"]) +# if not tb.loc[c]["gdp_per_capita_change"] > 5: +# print("gdp", c, tb.loc[c]["gdp_per_capita_change"]) + +# If not, print other countries that do fulfil the conditions and are not in the chart. +# other_countries = sorted(set(tb.index) - set(countries)) +# for c in other_countries: +# if (tb.loc[c]["consumption_emissions_per_capita_change"] < -5) and (tb.loc[c]["gdp_per_capita_change"] > 5): +# print(c, f' -> GDP: {tb.loc[c]["gdp_per_capita_change"]: .1f}%, Emissions: {tb.loc[c]["consumption_emissions_per_capita_change"]:.1f}%') + +# plot_decoupling(tb, countries=countries) diff --git a/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py b/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py new file mode 100644 index 00000000000..c93ad47b92b --- /dev/null +++ b/etl/steps/data/garden/emissions/2024-11-21/owid_co2.py @@ -0,0 +1,490 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +Datasets combined: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. +* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on +GDP are included. + +""" + +import re + +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Origin, Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. 
paths = PathFinder(__file__)

# Conversion factor from tonnes to million tonnes.
TONNES_TO_MILLION_TONNES = 1e-6

# Select columns to use from each dataset, and how to rename them.
# Each dictionary maps the column name in the input table (key) to the column name in the output table (value).
# Columns from the Global Carbon Budget table.
GCP_COLUMNS = {
    "country": "country",
    "year": "year",
    "emissions_total": "co2",
    "emissions_total_per_capita": "co2_per_capita",
    "traded_emissions": "trade_co2",
    "emissions_from_cement": "cement_co2",
    "emissions_from_cement_per_capita": "cement_co2_per_capita",
    "emissions_from_coal": "coal_co2",
    "emissions_from_coal_per_capita": "coal_co2_per_capita",
    "emissions_from_flaring": "flaring_co2",
    "emissions_from_flaring_per_capita": "flaring_co2_per_capita",
    "emissions_from_gas": "gas_co2",
    "emissions_from_gas_per_capita": "gas_co2_per_capita",
    "emissions_from_oil": "oil_co2",
    "emissions_from_oil_per_capita": "oil_co2_per_capita",
    # NOTE(review): the annual series keeps "industry" in its new name ("other_industry_co2"), whereas the per capita,
    # cumulative and share-of-global variants use "other_co2". Presumably kept for backward compatibility of the
    # published column names; confirm before renaming any of them.
    "emissions_from_other_industry": "other_industry_co2",
    "emissions_from_other_industry_per_capita": "other_co2_per_capita",
    "pct_growth_emissions_total": "co2_growth_prct",
    "growth_emissions_total": "co2_growth_abs",
    "emissions_total_per_gdp": "co2_per_gdp",
    "emissions_total_per_unit_energy": "co2_per_unit_energy",
    "consumption_emissions": "consumption_co2",
    "consumption_emissions_per_capita": "consumption_co2_per_capita",
    "consumption_emissions_per_gdp": "consumption_co2_per_gdp",
    "cumulative_emissions_total": "cumulative_co2",
    "cumulative_emissions_from_cement": "cumulative_cement_co2",
    "cumulative_emissions_from_coal": "cumulative_coal_co2",
    "cumulative_emissions_from_flaring": "cumulative_flaring_co2",
    "cumulative_emissions_from_gas": "cumulative_gas_co2",
    "cumulative_emissions_from_oil": "cumulative_oil_co2",
    "cumulative_emissions_from_other_industry": "cumulative_other_co2",
    "pct_traded_emissions": "trade_co2_share",
    "emissions_total_as_share_of_global": "share_global_co2",
    "emissions_from_cement_as_share_of_global": "share_global_cement_co2",
    "emissions_from_coal_as_share_of_global": "share_global_coal_co2",
    "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2",
    "emissions_from_gas_as_share_of_global": "share_global_gas_co2",
    "emissions_from_oil_as_share_of_global": "share_global_oil_co2",
    "emissions_from_other_industry_as_share_of_global": "share_global_other_co2",
    "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2",
    "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2",
    "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2",
    "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2",
    "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2",
    "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2",
    "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2",
    # New variables, related to land-use change emissions.
    "cumulative_emissions_from_land_use_change": "cumulative_luc_co2",
    "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2",
    "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc",
    "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc",
    "emissions_from_land_use_change": "land_use_change_co2",
    "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2",
    "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita",
    "emissions_total_including_land_use_change": "co2_including_luc",
    "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc",
    "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita",
    "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp",
    "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy",
    "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs",
    "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct",
}
# Columns from the national contributions to climate change table (Jones et al.).
JONES_COLUMNS = {
    "country": "country",
    "year": "year",
    "temperature_response_co2_total": "temperature_change_from_co2",
    "temperature_response_ghg_total": "temperature_change_from_ghg",
    "temperature_response_ch4_total": "temperature_change_from_ch4",
    "temperature_response_n2o_total": "temperature_change_from_n2o",
    "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg",
}
# Columns from the greenhouse gas emissions by sector table (Climate Watch).
# Note that the totals including LUCF get the plain names ("total_ghg", "ghg_per_capita").
CLIMATE_WATCH_GHG_COLUMNS = {
    "country": "country",
    "year": "year",
    "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf",
    "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita",
    "total_ghg_emissions_including_lucf": "total_ghg",
    "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita",
}
CLIMATE_WATCH_CH4_COLUMNS = {
    "country": "country",
    "year": "year",
    "total_ch4_emissions_including_lucf": "methane",
    "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita",
}
CLIMATE_WATCH_N2O_COLUMNS = {
    "country": "country",
    "year": "year",
    "total_n2o_emissions_including_lucf": "nitrous_oxide",
    "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita",
}
PRIMARY_ENERGY_COLUMNS = {
    "country": "country",
    "year": "year",
    "primary_energy_consumption__twh": "primary_energy_consumption",
    "primary_energy_consumption_per_capita__kwh": "energy_per_capita",
    "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp",
}
REGIONS_COLUMNS = {
    "name": "country",
    "iso_alpha3": "iso_code",
}
POPULATION_COLUMNS = {
    "country": "country",
    "year": "year",
    "population": "population",
}
GDP_COLUMNS = {
    "country": "country",
    "year": "year",
    "gdp": "gdp",
}

# Unit conversions to apply: original unit -> conversion factor, and new unit names.
UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}}


def convert_units(table: Table) -> Table:
    """Convert units of table.

    Columns whose metadata unit is one of the keys of UNITS are multiplied by the corresponding conversion factor,
    and their unit, short unit, title and short description are updated accordingly.

    Parameters
    ----------
    table : Table
        Data with its original units.

    Returns
    -------
    Table
        Data after converting units of specific columns.

    """
    table = table.copy()
    # Check units and convert to more convenient ones.
    for column in table.columns:
        unit = table[column].metadata.unit
        if unit not in UNITS:
            continue
        new_units = UNITS[unit]

        # Read the texts to be updated before modifying the column.
        title = table[column].metadata.title
        # Use 'description' if it is defined; otherwise use 'description_short'.
        description_short = table[column].metadata.description or table[column].metadata.description_short

        table[column] *= new_units["conversion"]
        table[column].metadata.unit = new_units["new_unit"]
        table[column].metadata.short_unit = new_units["new_short_unit"]
        # Title and description may be undefined, in which case there is no unit name to replace.
        if title is not None:
            table[column].metadata.title = title.replace(unit, new_units["new_unit"])
        if description_short is not None:
            table[column].metadata.description_short = description_short.replace(unit, new_units["new_unit"])

    return table


def combine_tables(
    tb_gcp: Table,
    tb_jones: Table,
    tb_climate_watch_ghg: Table,
    tb_climate_watch_ch4: Table,
    tb_climate_watch_n2o: Table,
    tb_energy: Table,
    tb_gdp: Table,
    tb_population: Table,
    tb_regions: Table,
) -> Table:
    """Combine tables.

    Parameters
    ----------
    tb_gcp : Table
        Global Carbon Budget table (from Global Carbon Project).
    tb_jones : Table
        National contributions to climate change (from Jones et al. (2023)).
    tb_climate_watch_ghg : Table
        Greenhouse gas emissions table (from Climate Watch).
    tb_climate_watch_ch4 : Table
        CH4 emissions table (from Climate Watch).
    tb_climate_watch_n2o : Table
        N2O emissions table (from Climate Watch).
    tb_energy : Table
        Primary energy consumption table (from EI & EIA).
    tb_gdp : Table
        Maddison GDP table (from GGDC).
    tb_population : Table
        OWID population table (from various sources).
    tb_regions : Table
        OWID regions table.

    Returns
    -------
    combined : Table
        Combined table with metadata and variables metadata.

    Raises
    ------
    AssertionError
        If two of the merged tables share a non-key column (pandas then appends "_x" and "_y" to their names).

    """
    # Combine main tables (with an outer join, to gather all entities from all tables).
    combined = tb_gcp.copy()
    for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]:
        combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name)

    # Add secondary tables (with a left join, to keep only entities for which we have emissions data).
    for table in [tb_energy, tb_gdp, tb_population]:
        combined = combined.merge(table, on=["country", "year"], how="left")

    # Countries-regions dataset does not have a year column, so it has to be merged on country.
    combined = combined.merge(tb_regions, on="country", how="left")

    # Check that there were no repetition in column names.
    # When merged tables share a non-key column, pandas keeps both and appends the suffixes "_x" and "_y".
    error = "Repeated columns in combined data."
    repeated_columns = [column for column in combined.columns if column.endswith(("_x", "_y"))]
    assert len(repeated_columns) == 0, f"{error} Found: {repeated_columns}"

    # Adjust units.
    combined = convert_units(combined)

    return combined


def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table:
    """Clean and prepare output table.

    Parameters
    ----------
    combined : Table
        Combined table.
    ds_regions : Dataset
        Regions dataset, only used to get its version.

    Returns
    -------
    combined: Table
        Cleaned combined table.

    Raises
    ------
    AssertionError
        If any column contains infinite values (positive or negative).

    """
    # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data).
    columns_that_must_have_data = [
        column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"]
    ]
    combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True)

    # Add metadata to the ISO column (loaded from the regions dataset).
    combined["iso_code"].m.origins = [
        Origin(
            producer="International Organization for Standardization",
            title="Regions",
            date_published=ds_regions.version,
        )
    ]
    combined["iso_code"].metadata.title = "ISO code"
    combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes."
    combined["iso_code"].metadata.unit = ""

    # Sanity check: no column should contain infinite values of either sign.
    columns_with_inf = [column for column in combined.columns if combined[column].isin([np.inf, -np.inf]).any()]
    assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}"

    # Sort rows and columns conveniently.
    first_columns = ["country", "year", "iso_code", "population", "gdp"]
    combined = combined[first_columns + [column for column in sorted(combined.columns) if column not in first_columns]]

    # Improve table format.
    combined = combined.format()

    return combined


def remove_details_on_demand(text: str) -> str:
    """Remove references to details on demand from a text, keeping the linked words.

    Example: "This is a [description](#dod:something)." -> "This is a description."

    Each reference is handled separately, so a text can contain several of them, and square brackets that are not
    part of a details-on-demand reference are left untouched.

    Parameters
    ----------
    text : str
        Text that may contain references to details on demand.

    Returns
    -------
    str
        Text without references to details on demand.

    """
    # Match "[linked words](#dod:key)" and keep only the linked words.
    return re.sub(r"\[([^\]]*)\]\(#dod:[^)]*\)", r"\1", text)


def prepare_codebook(tb: Table) -> Table:
    """Create a codebook table that describes each column of the output data.

    Parameters
    ----------
    tb : Table
        Output data table (as returned by prepare_outputs).

    Returns
    -------
    codebook : Table
        Table with one row per column of the data (including country and year), and columns "column", "description",
        "unit" and "source".

    """
    table = tb.reset_index()

    # Manually create an origin for the regions dataset.
    regions_origin = [Origin(producer="Our World in Data", title="Regions", date_published=str(table["year"].max()))]

    # Manually edit some of the metadata fields.
    table["country"].metadata.title = "Country"
    table["country"].metadata.description_short = "Geographic location."
    table["country"].metadata.description = None
    table["country"].metadata.unit = ""
    table["country"].metadata.origins = regions_origin
    table["year"].metadata.title = "Year"
    table["year"].metadata.description_short = "Year of observation."
    table["year"].metadata.description = None
    table["year"].metadata.unit = ""
    table["year"].metadata.origins = regions_origin

    ####################################################################################################################
    # Temporary fix: remove the 'description' field of the population column, while it still has one.
    if table["population"].metadata.description is None:
        print("WARNING: Column population has no longer a description field. Remove this part of the code")
    else:
        table["population"].metadata.description = None

    ####################################################################################################################

    # Gather column names, titles, short descriptions, unit and origins from the indicators' metadata.
    metadata = {"column": [], "description": [], "unit": [], "source": []}
    for column in table.columns:
        column_metadata = table[column].metadata
        metadata["column"].append(column)

        if hasattr(column_metadata, "description") and column_metadata.description is not None:
            print(f"WARNING: Column {column} still has a 'description' field.")
        # Prepare indicator's description (public title if defined, otherwise title, followed by short description).
        description = ""
        if (
            hasattr(column_metadata.presentation, "title_public")
            and column_metadata.presentation.title_public is not None
        ):
            description += column_metadata.presentation.title_public
        else:
            description += column_metadata.title
        if column_metadata.description_short:
            description += f" - {column_metadata.description_short}"
        description = remove_details_on_demand(description)
        metadata["description"].append(description)

        # Prepare indicator's unit.
        if column_metadata.unit is None:
            print(f"WARNING: Column {column} does not have a unit.")
            unit = ""
        else:
            unit = column_metadata.unit
        metadata["unit"].append(unit)

        # Gather unique origins of current variable (keeping the order in which they appear).
        unique_sources = []
        for origin in column_metadata.origins:
            # Construct the source name from the origin's attribution.
            # If not defined, build it using the default format "Producer - Data product (year)".
            source_name = (
                origin.attribution
                or f"{origin.producer} - {origin.title or origin.title_snapshot} ({origin.date_published.split('-')[0]})"
            )

            # Add url at the end of the source.
            if origin.url_main:
                source_name += f" [{origin.url_main}]"

            # Add the source to the list of unique sources.
            if source_name not in unique_sources:
                unique_sources.append(source_name)

        # Concatenate all sources.
        sources_combined = "; ".join(unique_sources)
        metadata["source"].append(sources_combined)

    # Create a dataframe with the gathered metadata and sort conveniently by column name.
    codebook = pd.DataFrame(metadata).set_index("column").sort_index()
    # For clarity, ensure column descriptions are in the same order as the columns in the data.
    first_columns = ["country", "year", "iso_code", "population", "gdp"]
    codebook = pd.concat([codebook.loc[first_columns], codebook.drop(first_columns, errors="raise")]).reset_index()
    # Create a table with the appropriate metadata.
    codebook = Table(codebook).format(
        keys=["column"], sort_rows=False, sort_columns=False, short_name="owid_co2_codebook"
    )
    codebook_origin = [
        Origin(producer="Our World in Data", title="CO2-data codebook", date_published=str(table["year"].max()))
    ]
    for column in ["description", "unit", "source"]:
        codebook[column].metadata.origins = codebook_origin

    return codebook


def sanity_check_outputs(tb: Table, tb_codebook: Table) -> None:
    """Check that data and codebook are consistent with each other.

    Parameters
    ----------
    tb : Table
        Output data table.
    tb_codebook : Table
        Codebook table.

    Raises
    ------
    AssertionError
        If the columns listed in the codebook differ from the columns of the data (including its index columns), or
        if any row of the data contains only missing values.

    """
    error = "Dataset columns should coincide with the codebook 'columns'."
    assert set(tb_codebook.reset_index()["column"]) == set(tb.reset_index().columns), error

    error = "All rows in dataset should contain at least one non-NaN value."
    assert not tb.isnull().all(axis=1).any(), error


def run(dest_dir: str) -> None:
    """Combine all input datasets into the OWID CO2 data table and its codebook, and save them in a garden dataset.

    Parameters
    ----------
    dest_dir : str
        Destination folder of the dataset, passed on to create_dataset.

    """
    #
    # Load data.
    #
    # Load the global carbon budget dataset from the Global Carbon Project (GCP).
    ds_gcp = paths.load_dataset("global_carbon_budget")

    # Load the Jones et al. (2023) dataset on national contributions to climate change.
    ds_jones = paths.load_dataset("national_contributions")

    # Load the greenhouse gas emissions by sector dataset by Climate Watch.
    ds_climate_watch = paths.load_dataset("emissions_by_sector")

    # Load the GDP dataset by GGDC Maddison.
    ds_gdp = paths.load_dataset("maddison_project_database")

    # Load primary energy consumption dataset (by different sources in our 'energy' namespace).
    ds_energy = paths.load_dataset("primary_energy_consumption")

    # Load population dataset.
    ds_population = paths.load_dataset("population")

    # Load countries-regions dataset (required to get ISO codes).
    ds_regions = paths.load_dataset("regions")

    # Gather all required tables from all datasets.
    tb_gcp = ds_gcp["global_carbon_budget"]
    tb_jones = ds_jones["national_contributions"]
    tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"]
    tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"]
    tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"]
    tb_energy = ds_energy["primary_energy_consumption"]
    tb_gdp = ds_gdp["maddison_project_database"]
    tb_population = ds_population["population"]
    tb_regions = ds_regions["regions"]

    #
    # Process data.
    #
    # Choose required columns and rename them.
    tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise")
    tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise")
    tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename(
        columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise"
    )
    tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename(
        columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise"
    )
    tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename(
        columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise"
    )
    tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename(
        columns=PRIMARY_ENERGY_COLUMNS, errors="raise"
    )
    tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise")
    tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename(
        columns=POPULATION_COLUMNS, errors="raise"
    )
    tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise")

    # Combine tables.
    combined = combine_tables(
        tb_gcp=tb_gcp,
        tb_jones=tb_jones,
        tb_climate_watch_ghg=tb_climate_watch_ghg,
        tb_climate_watch_ch4=tb_climate_watch_ch4,
        tb_climate_watch_n2o=tb_climate_watch_n2o,
        tb_energy=tb_energy,
        tb_gdp=tb_gdp,
        tb_population=tb_population,
        tb_regions=tb_regions,
    )

    # Prepare output data table.
    tb = prepare_outputs(combined=combined, ds_regions=ds_regions)

    # Prepare codebook.
    tb_codebook = prepare_codebook(tb=tb)

    # Sanity check.
    sanity_check_outputs(tb=tb, tb_codebook=tb_codebook)

    #
    # Save outputs.
    #
    # Create a new garden dataset that contains the data table and its codebook.
+ ds_grapher = create_dataset(dest_dir, tables=[tb, tb_codebook], check_variables_metadata=True) + ds_grapher.save() diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json new file mode 100644 index 00000000000..ac727db78bf --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.countries.json @@ -0,0 +1,283 @@ +{ + "Afghanistan": "Afghanistan", + "Africa": "Africa (GCP)", + "Albania": "Albania", + "Algeria": "Algeria", + "American Samoa": "American Samoa", + "Andorra": "Andorra", + "Angola": "Angola", + "Anguilla": "Anguilla", + "Antarctica": "Antarctica", + "Antigua and Barbuda": "Antigua and Barbuda", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Aruba": "Aruba", + "Asia": "Asia (GCP)", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahamas": "Bahamas", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Barbados": "Barbados", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Belize": "Belize", + "Benin": "Benin", + "Bermuda": "Bermuda", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bolivia (Plurinational State of)": "Bolivia", + "Bonaire, Saint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bonaire, Sint Eustatius and Saba": "Bonaire Sint Eustatius and Saba", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "British Virgin Islands": "British Virgin Islands", + "Brunei Darussalam": "Brunei", + "Bulgaria": "Bulgaria", + "Bunkers": "International transport", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Central America": "Central America (GCP)", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Christmas 
Island": "Christmas Island", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo": "Congo", + "Congo, Democratic Republic of the": "Democratic Republic of Congo", + "Cook Islands": "Cook Islands", + "Costa Rica": "Costa Rica", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cura\u00e7ao": "Curacao", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "Czechia": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominica": "Dominica", + "Dominican Republic": "Dominican Republic", + "EU27": "European Union (27) (GCP)", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Europe": "Europe (GCP)", + "Faeroe Islands": "Faroe Islands", + "Falkland Islands (Malvinas)": "Falkland Islands", + "Faroe Islands": "Faroe Islands", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "French Equatorial Africa": "French Equatorial Africa (GCP)", + "French Guiana": "French Guiana", + "French Polynesia": "French Polynesia", + "French West Africa": "French West Africa (GCP)", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Global": "World", + "Greece": "Greece", + "Greenland": "Greenland", + "Grenada": "Grenada", + "Guadeloupe": "Guadeloupe", + "Guatemala": "Guatemala", + "Guernsey": "Guernsey", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "International Aviation": "International aviation", + "International Shipping": "International shipping", + "Iran": "Iran", + "Iran (Islamic Republic of)": "Iran", + 
"Iraq": "Iraq", + "Ireland": "Ireland", + "Isle of Man": "Isle of Man", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jersey": "Jersey", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kiribati": "Kiribati", + "Korea (Democratic People's Republic of)": "North Korea", + "Korea, Republic of": "South Korea", + "Kosovo": "Kosovo", + "Kuwait": "Kuwait", + "Kuwaiti Oil Fires": "Kuwaiti Oil Fires (GCP)", + "Kyrgyzstan": "Kyrgyzstan", + "Lao People's Democratic Republic": "Laos", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Leeward Islands": "Leeward Islands (GCP)", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Liechtenstein": "Liechtenstein", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macao": "Macao", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Maldives": "Maldives", + "Mali": "Mali", + "Malta": "Malta", + "Marshall Islands": "Marshall Islands", + "Martinique": "Martinique", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mayotte": "Mayotte", + "Mexico": "Mexico", + "Micronesia (Federated States of)": "Micronesia (country)", + "Middle East": "Middle East (GCP)", + "Moldova": "Moldova", + "Moldova, Republic of": "Moldova", + "Monaco": "Monaco", + "Mongolia": "Mongolia", + "Montenegro": "Montenegro", + "Montserrat": "Montserrat", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nauru": "Nauru", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "Netherlands Antilles": "Netherlands Antilles", + "New Caledonia": "New Caledonia", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "Niue": "Niue", + "Non-OECD": "Non-OECD (GCP)", + "North America": "North America (GCP)", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "OECD": "OECD (GCP)", 
+ "Occupied Palestinian Territory": "Palestine", + "Oceania": "Oceania (GCP)", + "Oman": "Oman", + "Pacific Islands (Palau)": "Palau", + "Pakistan": "Pakistan", + "Palau": "Palau", + "Palestine, State of": "Palestine", + "Panama": "Panama", + "Panama Canal Zone": "Panama Canal Zone (GCP)", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Puerto Rico": "Puerto Rico", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Russian Federation": "Russia", + "Rwanda": "Rwanda", + "Ryukyu Islands": "Ryukyu Islands (GCP)", + "R\u00e9union": "Reunion", + "Saint Helena": "Saint Helena", + "Saint Kitts and Nevis": "Saint Kitts and Nevis", + "Saint Lucia": "Saint Lucia", + "Saint Martin (French part)": "Saint Martin (French part)", + "Saint Pierre and Miquelon": "Saint Pierre and Miquelon", + "Saint Vincent and the Grenadines": "Saint Vincent and the Grenadines", + "Samoa": "Samoa", + "San Marino": "San Marino", + "Sao Tome and Principe": "Sao Tome and Principe", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Seychelles": "Seychelles", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Sint Maarten (Dutch part)": "Sint Maarten (Dutch part)", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "Solomon Islands": "Solomon Islands", + "Somalia": "Somalia", + "South Africa": "South Africa", + "South America": "South America (GCP)", + "South Korea": "South Korea", + "South Sudan": "South Sudan", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "State of Palestine": "Palestine", + "St. Kitts-Nevis-Anguilla": "St. 
Kitts-Nevis-Anguilla (GCP)", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Svalbard and Jan Mayen": "Svalbard and Jan Mayen", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Syrian Arab Republic": "Syria", + "Taiwan": "Taiwan", + "Taiwan, Province of China": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Tanzania, United Republic of": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Tonga": "Tonga", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Türkiye": "Turkey", + "Turkmenistan": "Turkmenistan", + "Turks and Caicos Islands": "Turks and Caicos Islands", + "Tuvalu": "Tuvalu", + "USA": "United States", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United Kingdom of Great Britain and Northern Ireland": "United Kingdom", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Vatican City": "Vatican", + "Vanuatu": "Vanuatu", + "Venezuela": "Venezuela", + "Venezuela (Bolivarian Republic of)": "Venezuela", + "Viet Nam": "Vietnam", + "Virgin Islands (U.S.)": "United States Virgin Islands", + "Wallis and Futuna Islands": "Wallis and Futuna", + "Western Sahara": "Western Sahara", + "World": "World", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "\u00c5land Islands": "Aland Islands" +} diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json new file mode 100644 index 00000000000..6ad8ec106f5 --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.excluded_countries.json @@ -0,0 +1,7 @@ +[ + "KP Annex B", + "Non KP Annex B", + "DISPUTED", + "OTHER", + "EU27" +] diff --git 
a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml new file mode 100644 index 00000000000..9d9470af087 --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.meta.yml @@ -0,0 +1,514 @@ +definitions: + production_emissions_description_key: &production_emissions_description_key + - This data is based on territorial emissions, which do not account for emissions embedded in traded goods. + traded_emissions_description_key: &traded_emissions_description_key + - Net CO₂ emissions embedded in trade is the net of CO₂ which is imported or exported via traded goods with an economy. A positive value denotes a country or region is a net importer of CO₂ emissions; a negative value indicates a country is a net exporter. + international_aviation_description_key: &international_aviation_description_key + - Emissions from international aviation and shipping are not included in any country or region's emissions. They are only included in the global total emissions. + consumption_emissions_description_key: &consumption_emissions_description_key + - Consumption-based emissions attribute the emissions generated in the production of goods and services according to where they were _consumed_, rather than where they were _produced_. + - "The data is calculated by adjusting 'production-based' emissions (emissions produced domestically) for trade: Consumption-based emissions equals production-based emissions, _minus_ emissions embedded in exports, _plus_ emissions embedded in imports." + - If a country's consumption-based emissions are higher than its production emissions it is a net importer of carbon dioxide. If its consumption-based emissions are lower, then it is a net exporter. + - Consumption-based emissions are not available for all countries because not all countries have sufficient, high-quality trade data. 
But those without complete data are a small fraction (3%) of the global total. + - This data measures carbon dioxide (CO₂) emissions from fossil fuels and industry and does not include emissions from land use change, deforestation, soils, or vegetation. + per_capita_description_key: &per_capita_description_key + - Per capita emissions represent the emissions of an average person in a country or region - they are calculated as the total emissions divided by population. + # Common fields to be used in all indicators (unless overridden for specific indicators below). + common: + description_processing: &description_processing | + - Data on global emissions has been converted from tonnes of carbon to tonnes of carbon dioxide (CO₂) using a conversion factor of 3.664. + - Emissions from the Kuwaiti oil fires in 1991 have been included as part of Kuwait's emissions for that year. + - Country's share of the global population is calculated using our population dataset, based on [different sources](https://ourworldindata.org/population-sources). + - Each country's share of global CO₂ emissions from flaring has been calculated using global CO₂ emissions from flaring provided in the Global Carbon Budget dataset. + description_key: + # NOTE: The description key points are re-defined for each indicator on consumption-based emissions and traded emissions, as well as on per-capita indicators. + - *production_emissions_description_key + - *international_aviation_description_key + presentation: + topic_tags: + - CO2 & Greenhouse Gas Emissions + attribution_short: GCB + processing_level: major + +dataset: + title: Global Carbon Budget + update_period_days: 365 + +tables: + global_carbon_budget: + variables: + consumption_emissions: + title: "Annual consumption-based CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes. 
+ description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + consumption_emissions_as_share_of_global: + title: "Share of global annual CO₂ consumption-based emissions" + unit: "%" + short_unit: "%" + description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured as a percentage of global consumption-based emissions of CO₂ in the same year." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + ################################################################################################################## + # Curated indicator for data page. + consumption_emissions_per_capita: + title: Per capita consumption-based CO₂ emissions + description_short: | + Annual consumption-based emissions of carbon dioxide (CO₂), measured in tonnes per person. + description_key: + - *consumption_emissions_description_key + - *per_capita_description_key + - *international_aviation_description_key + description_processing: *description_processing + unit: tonnes per person + short_unit: t/person + display: + shortUnit: t + numDecimalPlaces: 0 + presentation: + attribution_short: Global Carbon Project + topic_tags: + - CO2 & Greenhouse Gas Emissions + - Climate Change + - Energy + faqs: + - fragment_id: emissions-from-aviation-and-shipping + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + - fragment_id: missing-consumption-based-emissions + gdoc_id: 1gGburArxglFdHXeTLotFW4TOOLoeRq5XW6UfAdKtaAw + grapher_config: + subtitle: >- + [Consumption-based emissions](#dod:consumptionbasedemissions) are national + emissions that have been adjusted for trade. It's production-based emissions + minus emissions embedded in exports, plus emissions embedded in imports. 
+ hideAnnotationFieldsInTitle: + time: true + entity: true + changeInPrefix: true + hideRelativeToggle: false + hasMapTab: true + tab: map + originUrl: https://ourworldindata.org/co2-and-greenhouse-gas-emissions + colorScale: + binningStrategy: equalInterval + map: + colorScale: + baseColorScheme: Reds + binningStrategy: manual + customNumericValues: + - 1 + - 2 + - 5 + - 10 + - 20 + - 50 + customNumericColors: + - null + - null + selectedEntityNames: + - United States + - United Kingdom + - European Union (27) + - China + - India + - Australia + - Brazil + - South Africa + relatedQuestions: + - url: https://ourworldindata.org/grapher/consumption-co2-per-capita#faqs + text: FAQs on this data + consumption_emissions_per_gdp: + title: "Annual consumption-based CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual consumption-based emissions of carbon dioxide (CO₂), measured in kilograms per dollar of GDP (2011 international-$)." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_consumption_emissions: + title: "Cumulative CO₂ consumption-based emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured in tonnes." + description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_consumption_emissions_as_share_of_global: + title: "Share of global cumulative CO₂ consumption-based emissions" + unit: "%" + short_unit: "%" + description_short: "Cumulative consumption-based emissions of carbon dioxide (CO₂) since the first year of available data, measured as a percentage of global cumulative consumption-based emissions." 
+ description_key: + - *consumption_emissions_description_key + - *international_aviation_description_key + cumulative_emissions_from_cement: + title: "Cumulative CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured in tonnes." + cumulative_emissions_from_cement_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from cement since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from cement." + cumulative_emissions_from_coal: + title: "Cumulative CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured in tonnes." + cumulative_emissions_from_coal_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from coal since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from coal." + cumulative_emissions_from_flaring: + title: "Cumulative CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured in tonnes." + cumulative_emissions_from_flaring_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from flaring since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from flaring." 
+ cumulative_emissions_from_gas: + title: "Cumulative CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured in tonnes." + cumulative_emissions_from_gas_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from gas since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from gas." + cumulative_emissions_from_land_use_change: + title: "Cumulative CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured in tonnes." + cumulative_emissions_from_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from land-use change since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from land-use change." + cumulative_emissions_from_oil: + title: "Cumulative CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured in tonnes." + cumulative_emissions_from_oil_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from oil since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from oil." 
+ cumulative_emissions_from_other_industry: + title: "Cumulative CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured in tonnes." + cumulative_emissions_from_other_industry_as_share_of_global: + title: "Share of global cumulative CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description_short: "Cumulative emissions of carbon dioxide (CO₂) from other industry sources since the first year of available data, measured as a percentage of global cumulative emissions of CO₂ from other industry sources." + cumulative_emissions_total: + title: "Cumulative CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured in tonnes." + cumulative_emissions_total_as_share_of_global: + title: "Share of global cumulative CO₂ emissions" + unit: "%" + short_unit: "%" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), excluding land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂." + cumulative_emissions_total_including_land_use_change: + title: "Cumulative CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured in tonnes." 
+ cumulative_emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global cumulative CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description_short: "Total cumulative emissions of carbon dioxide (CO₂), including land-use change, since the first year of available data, measured as a percentage of global total cumulative emissions of CO₂ (including land-use change)." + emissions_from_cement: + title: "Annual CO₂ emissions from cement" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes." + emissions_from_cement_as_share_of_global: + title: "Share of global annual CO₂ emissions from cement" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured as a percentage of global emissions of CO₂ from cement in the same year." + emissions_from_cement_per_capita: + title: "Annual CO₂ emissions from cement (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from cement, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_coal: + title: "Annual CO₂ emissions from coal" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes." + emissions_from_coal_as_share_of_global: + title: "Share of global annual CO₂ emissions from coal" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured as a percentage of global emissions of CO₂ from coal in the same year." 
+ emissions_from_coal_per_capita: + title: "Annual CO₂ emissions from coal (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from coal, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_flaring: + title: "Annual CO₂ emissions from flaring" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes." + emissions_from_flaring_as_share_of_global: + title: "Share of global annual CO₂ emissions from flaring" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured as a percentage of global emissions of CO₂ from flaring in the same year." + emissions_from_flaring_per_capita: + title: "Annual CO₂ emissions from flaring (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from flaring, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_gas: + title: "Annual CO₂ emissions from gas" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes." + emissions_from_gas_as_share_of_global: + title: "Share of global annual CO₂ emissions from gas" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured as a percentage of global emissions of CO₂ from gas in the same year." 
+ emissions_from_gas_per_capita: + title: "Annual CO₂ emissions from gas (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from gas, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_land_use_change: + title: "Annual CO₂ emissions from land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes." + emissions_from_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions from land-use change" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured as a percentage of global emissions of CO₂ from land-use change in the same year." + emissions_from_land_use_change_per_capita: + title: "Annual CO₂ emissions from land-use change per capita" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_oil: + title: "Annual CO₂ emissions from oil" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes." + emissions_from_oil_as_share_of_global: + title: "Share of global annual CO₂ emissions from oil" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured as a percentage of global emissions of CO₂ from oil in the same year." 
+ emissions_from_oil_per_capita: + title: "Annual CO₂ emissions from oil (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from oil, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_from_other_industry: + title: "Annual CO₂ emissions from other industry" + unit: "tonnes" + short_unit: "t" + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes." + emissions_from_other_industry_as_share_of_global: + title: "Share of global annual CO₂ emissions from other industry" + unit: "%" + short_unit: "%" + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured as a percentage of global emissions of CO₂ from other industry sources in the same year." + emissions_from_other_industry_per_capita: + title: "Annual CO₂ emissions from other industry (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂) from other industry sources, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total: + title: "Annual CO₂ emissions" + unit: "tonnes" + short_unit: "t" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." + emissions_total_as_share_of_global: + title: "Share of global annual CO₂ emissions" + unit: "%" + short_unit: "%" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured as a percentage of global emissions of CO₂ in the same year." 
+ emissions_total_including_land_use_change: + title: "Annual CO₂ emissions including land-use change" + unit: "tonnes" + short_unit: "t" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes." + emissions_total_including_land_use_change_as_share_of_global: + title: "Share of global annual CO₂ emissions including land-use change" + unit: "%" + short_unit: "%" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured as a percentage of global total emissions of CO₂ in the same year." + emissions_total_including_land_use_change_per_capita: + title: "Annual CO₂ emissions including land-use change per capita" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total_including_land_use_change_per_gdp: + title: "Annual CO₂ emissions including land-use change per GDP" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per dollar of GDP (2011 international-$)." + emissions_total_including_land_use_change_per_unit_energy: + title: "Annual CO₂ emissions including land-use change per unit energy" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description_short: "Annual total emissions of carbon dioxide (CO₂), including land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." 
+ emissions_total_per_capita: + title: "Annual CO₂ emissions (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes per person." + description_key: + - *per_capita_description_key + - *production_emissions_description_key + - *international_aviation_description_key + emissions_total_per_gdp: + title: "Annual CO₂ emissions per GDP (kg per international-$)" + unit: "kilograms per international-$" + short_unit: "kg/$" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per dollar of GDP (2011 international-$)." + emissions_total_per_unit_energy: + title: "Annual CO₂ emissions per unit energy (kg per kilowatt-hour)" + unit: "kilograms per kilowatt-hour" + short_unit: "kg/kWh" + description_short: "Annual total emissions of carbon dioxide (CO₂), excluding land-use change, measured in kilograms per kilowatt-hour of primary energy consumption." + gdp: + title: "GDP" + unit: "2011 international-$" + short_unit: "$" + description_short: >- + Gross domestic product measured in international-$ using 2011 prices to adjust for price changes over time (inflation) + and price differences between countries. + growth_emissions_total: + title: "Annual CO₂ emissions growth (abs)" + unit: "tonnes" + short_unit: "t" + description_short: "Annual growth in total emissions of carbon dioxide (CO₂), excluding land-use change, measured in tonnes." + growth_emissions_total_including_land_use_change: + title: "Annual CO₂ emissions growth including land-use change (abs)" + unit: "tonnes" + short_unit: "t" + description_short: "Annual growth in total emissions of carbon dioxide (CO₂), including land-use change, measured in tonnes."
+ pct_growth_emissions_total: + title: "Annual CO₂ emissions growth (%)" + unit: "%" + short_unit: "%" + description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), excluding land-use change." + pct_growth_emissions_total_including_land_use_change: + title: "Growth rate of emissions including land-use change (%)" + unit: "%" + short_unit: "%" + description_short: "Annual percentage growth in total emissions of carbon dioxide (CO₂), including land-use change." + pct_traded_emissions: + title: "Share of annual CO₂ emissions embedded in trade" + unit: "%" + short_unit: "%" + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured as a percentage of emissions of CO₂." + description_key: + - *traded_emissions_description_key + - *international_aviation_description_key + population: + title: "Population" + unit: "persons" + short_unit: "persons" + population_as_share_of_global: + title: "Share of population" + unit: "%" + short_unit: "%" + description_short: "Population, measured as a percentage of global total population in the same year." + primary_energy_consumption: + title: "Primary energy consumption" + unit: "terawatt-hours" + short_unit: "TWh" + description_short: "Primary energy consumption, measured in terawatt-hours per year." + traded_emissions: + title: "Annual CO₂ emissions embedded in trade" + unit: "tonnes" + short_unit: "t" + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes." + description_key: + - *traded_emissions_description_key + - *international_aviation_description_key + traded_emissions_per_capita: + title: "Annual CO₂ emissions embedded in trade (per capita)" + unit: "tonnes per person" + short_unit: "t/person" + display: + shortUnit: t + description_short: "Annual net carbon dioxide (CO₂) emissions embedded in trade, measured in tonnes per person." 
+ description_key: + - *per_capita_description_key + - *traded_emissions_description_key + - *international_aviation_description_key diff --git a/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..e453ba09dff --- /dev/null +++ b/etl/steps/data/garden/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,1142 @@ +"""This step creates the Global Carbon Budget (GCB) dataset, by the Global Carbon Project (GCP). + +It harmonizes and further processes meadow data, and uses the following auxiliary datasets: +- GGDC's Maddison dataset on GDP, used to calculate emissions per GDP. +- Primary Energy Consumption (mix of sources from the 'energy' namespace) to calculate emissions per unit energy. +- Population (mix of sources), to calculate emissions per capita. +- Regions (mix of sources), to generate aggregates for different continents. +- WorldBank's Income groups, to generate aggregates for different income groups. + +""" +import numpy as np +import owid.catalog.processing as pr +from owid.catalog import Dataset, Table +from owid.datautils import dataframes +from structlog import get_logger + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + +# Expected outliers in consumption-based emissions (with negative emissions in the original data, that will be removed). +# NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version. +OUTLIERS_IN_CONSUMPTION_DF = [ + ("Panama", 2003), + ("Panama", 2004), + ("Panama", 2005), + ("Panama", 2006), + ("Panama", 2012), + ("Panama", 2013), + ("Venezuela", 2018), +] + +# Regions and income groups to create by aggregating contributions from member countries. 
+# In the following dictionary, if nothing is stated, the region is supposed to be a default continent/income group. +# Otherwise, the dictionary can have "additional_regions", "excluded_regions", "additional_members", and +# "excluded_members". The aggregates will be calculated on the resulting countries. +REGIONS = { + # Default continents. + "Africa": {}, + "Asia": {}, + "Europe": {}, + # We exclude GCB's EU27 data, because it appears only in a few metrics, and, when it exists, it is identical to our + # aggregated European Union (27). + "European Union (27)": {}, + "North America": {}, + "Oceania": {}, + "South America": {}, + # Income groups. + "Low-income countries": {}, + "Upper-middle-income countries": {}, + "Lower-middle-income countries": {}, + "High-income countries": {}, + # Additional composite regions. + "Asia (excl. China and India)": { + "additional_regions": ["Asia"], + "excluded_members": ["China", "India"], + }, + "Europe (excl. EU-27)": {"additional_regions": ["Europe"], "excluded_regions": ["European Union (27)"]}, + "Europe (excl. EU-28)": { + "additional_regions": ["Europe"], + "excluded_regions": ["European Union (27)"], + "excluded_members": ["United Kingdom"], + }, + "European Union (28)": { + "additional_regions": ["European Union (27)"], + "additional_members": ["United Kingdom"], + }, + "North America (excl. USA)": { + "additional_regions": ["North America"], + "excluded_members": ["United States"], + }, +} + +# Columns to use from GCB fossil CO2 emissions data and how to rename them. +CO2_COLUMNS = { + "country": "country", + "year": "year", + "cement": "emissions_from_cement", + "coal": "emissions_from_coal", + "flaring": "emissions_from_flaring", + "gas": "emissions_from_gas", + "oil": "emissions_from_oil", + "other": "emissions_from_other_industry", + "total": "emissions_total", +} + +# List all sources of emissions considered.
+EMISSION_SOURCES = [column for column in CO2_COLUMNS.values() if column not in ["country", "year"]] + +# Columns to use from primary energy consumption data and how to rename them. +PRIMARY_ENERGY_COLUMNS = { + "country": "country", + "year": "year", + "primary_energy_consumption__twh": "primary_energy_consumption", +} + +# Columns to use from historical emissions data and how to rename them. +HISTORICAL_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + # Global fossil emissions are used only for sanity checks. + "global_fossil_emissions": "global_fossil_emissions", + "global_land_use_change_emissions": "global_emissions_from_land_use_change", +} + +# Columns to use from consumption-based emissions data and how to rename them. +CONSUMPTION_EMISSIONS_COLUMNS = { + "country": "country", + "year": "year", + "consumption_emissions": "consumption_emissions", +} + +# Conversion from terawatt-hours to kilowatt-hours. +TWH_TO_KWH = 1e9 + +# Conversion factor to change from billion tonnes of carbon to tonnes of CO2. +BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e9 + +# Conversion factor to change from million tonnes of carbon to tonnes of CO2. +MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2 = 3.664 * 1e6 + +# Conversion from million tonnes of CO2 to tonnes of CO2. +MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2 = 1e6 + +# Conversion from tonnes of CO2 to kg of CO2 (used for emissions per GDP and per unit energy). +TONNES_OF_CO2_TO_KG_OF_CO2 = 1000 + +# In order to remove uninformative columns, keep only rows where at least one of the following columns has data. +# All other columns are either derived variables, or global variables, or auxiliary variables from other datasets. 
def run(dest_dir: str) -> None:
    """Build the garden Global Carbon Budget dataset and save it in ``dest_dir``.

    The step loads the meadow tables and the auxiliary datasets, applies the basic processing of each input,
    runs sanity checks on the inputs, harmonizes country names, combines everything into a single table (with
    additional variables and region aggregates), runs sanity checks on the output, and saves the result.

    Parameters
    ----------
    dest_dir : str
        Destination folder passed on to ``create_dataset``.

    """
    #
    # Load inputs.
    #
    ds_meadow = paths.load_dataset("global_carbon_budget")

    def _read_meadow_table(table_name: str) -> Table:
        # All meadow tables are read in the same way.
        return ds_meadow.read(table_name, safe_types=False)

    tb_co2 = _read_meadow_table("global_carbon_budget_fossil_co2_emissions")
    tb_historical = _read_meadow_table("global_carbon_budget_historical_budget")
    tb_consumption = _read_meadow_table("global_carbon_budget_consumption_emissions")
    tb_production = _read_meadow_table("global_carbon_budget_production_emissions")
    tb_land_use = _read_meadow_table("global_carbon_budget_land_use_change")

    # Auxiliary datasets: primary energy consumption (main table), GDP, population, regions and income groups.
    tb_energy = paths.load_dataset("primary_energy_consumption")["primary_energy_consumption"].reset_index()
    ds_gdp = paths.load_dataset("maddison_project_database")
    ds_population = paths.load_dataset("population")
    ds_regions = paths.load_dataset("regions")
    ds_income_groups = paths.load_dataset("income_groups")

    #
    # Process data.
    #
    # Basic processing of each of the inputs.
    tb_co2 = prepare_fossil_co2_emissions(tb_co2=tb_co2)
    tb_consumption = prepare_consumption_emissions(tb_consumption=tb_consumption)
    tb_production = prepare_production_emissions(tb_production=tb_production)
    tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use)
    energy_columns = list(PRIMARY_ENERGY_COLUMNS)
    tb_energy = tb_energy[energy_columns].rename(columns=PRIMARY_ENERGY_COLUMNS, errors="raise")
    tb_historical = prepare_historical_emissions(tb_historical=tb_historical)

    # Sanity checks on the inputs have to be run before harmonizing country names.
    sanity_checks_on_input_data(
        tb_production=tb_production, tb_consumption=tb_consumption, tb_historical=tb_historical, tb_co2=tb_co2
    )

    # Global emissions (including bunker and land-use change emissions) also have to be extracted before
    # harmonization.
    tb_global_emissions = extract_global_emissions(
        tb_co2=tb_co2, tb_historical=tb_historical, ds_population=ds_population
    )

    # Harmonize country names of all national tables (in the same order as they were prepared).
    tb_co2, tb_consumption, tb_production, tb_land_use = [
        harmonize_country_names(tb=table) for table in (tb_co2, tb_consumption, tb_production, tb_land_use)
    ]

    # Harmonization creates duplicated rows for Palau, which need to be fixed.
    tb_co2 = fix_duplicated_palau_data(tb_co2=tb_co2)

    # Combine all tables and add new variables (consumption-based emissions, emission intensity, per-capita
    # emissions, etc.).
    tb_combined = combine_data_and_add_variables(
        tb_co2=tb_co2,
        tb_production=tb_production,
        tb_consumption=tb_consumption,
        tb_global_emissions=tb_global_emissions,
        tb_land_use=tb_land_use,
        tb_energy=tb_energy,
        ds_gdp=ds_gdp,
        ds_population=ds_population,
        ds_regions=ds_regions,
        ds_income_groups=ds_income_groups,
    )

    ####################################################################################################################
    # The data for emissions from other industry is quite sparse.
    # This causes the share of emissions to have spurious jumps (because during some years only a few countries are
    # informed). These jumps are easy to see for China and US. From 1990 on, more countries are informed, and
    # therefore the data is more reliable. So the shares of emissions from other industry are set to None for years
    # before 1990.
    before_1990 = tb_combined["year"] < 1990
    for sparse_share_column in (
        "emissions_from_other_industry_as_share_of_global",
        "cumulative_emissions_from_other_industry_as_share_of_global",
    ):
        tb_combined.loc[before_1990, sparse_share_column] = None
    ####################################################################################################################

    # Set an appropriate index, ensure there are no rows that only have nan, and sort conveniently.
    tb_combined = tb_combined.format(sort_columns=True, short_name=paths.short_name)

    # Run sanity checks on output data.
    sanity_checks_on_output_data(tb_combined)

    #
    # Save outputs.
    #
    # Create a new garden dataset (using metadata from the meadow dataset) and save it.
    create_dataset(
        dest_dir=dest_dir, tables=[tb_combined], default_metadata=ds_meadow.metadata, check_variables_metadata=True
    ).save()
def sanity_checks_on_input_data(
    tb_production: Table, tb_consumption: Table, tb_historical: Table, tb_co2: Table
) -> None:
    """Run sanity checks on input data files.

    These checks should be used prior to country harmonization, but after basic processing of the tables.
    The function works on copies of the given tables and returns nothing; it only raises if a check fails.

    The checks are, in order:
    1. Bunker emissions coincide exactly in the production and consumption tables.
    2. There are no negative values in the production, fossil CO2, and consumption tables.
    3. World production emissions coincide exactly with World consumption emissions (on common years).
    4. World production emissions coincide (within 0.0001%) with global fossil emissions from the historical table.
    5. Bunker emissions coincide (within 0.0001%) with the sum of international aviation and shipping emissions.
    6. All other production emissions coincide (within 0.01%) with total emissions from the fossil CO2 table.

    Parameters
    ----------
    tb_production : Table
        Production-based emissions from GCP's official national emissions dataset (excel file).
    tb_consumption : Table
        Consumption-based emissions from GCP's official national emissions dataset (excel file).
    tb_historical : Table
        Historical emissions from GCP's official global emissions dataset (excel file).
    tb_co2 : Table
        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file).

    Raises
    ------
    AssertionError
        If any of the checks fails.

    """
    tb_production = tb_production.copy()
    tb_consumption = tb_consumption.copy()
    tb_historical = tb_historical.copy()
    tb_co2 = tb_co2.copy()

    # In the original data, Bunkers was included in the national data file, as another country.
    # But I suppose it should be considered as another kind of global emission.
    # In fact, bunker emissions should coincide for production and consumption emissions.
    global_bunkers_emissions = (
        tb_production[tb_production["country"] == "Bunkers"][["year", "production_emissions"]]
        .reset_index(drop=True)
        .rename(columns={"production_emissions": "global_bunker_emissions"}, errors="raise")
    )

    # Check that we get exactly the same array of bunker emissions from the consumption emissions table
    # (on years where there is data for bunker emissions in both datasets).
    # Both sides are renamed to the same column name, so the consumption column gets the "_check" suffix.
    comparison = pr.merge(
        global_bunkers_emissions,
        tb_consumption[tb_consumption["country"] == "Bunkers"][["year", "consumption_emissions"]]
        .reset_index(drop=True)
        .rename(columns={"consumption_emissions": "global_bunker_emissions"}, errors="raise"),
        how="inner",
        on="year",
        suffixes=("", "_check"),
    )

    error = "Bunker emissions were expected to coincide in production and consumption emissions tables."
    assert (comparison["global_bunker_emissions"] == comparison["global_bunker_emissions_check"]).all(), error

    # Check that all production-based emissions are non-negative (missing values are treated as zero).
    error = "There are negative emissions in tb_production (from the additional variables dataset)."
    assert (tb_production.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error

    # Check that all production-based emissions from the fossil CO2 dataset are non-negative.
    error = "There are negative emissions in tb_co2 (from the fossil CO2 dataset)."
    assert (tb_co2.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error

    # Check that all consumption-based emissions are non-negative.
    error = "There are negative emissions in tb_consumption (from the national emissions dataset)."
    assert (tb_consumption.drop(columns=["country", "year"]).fillna(0) >= 0).all().all(), error

    # Check that, for the World, production emissions coincide with consumption emissions (on common years).
    error = "Production and consumption emissions for the world were expected to be identical."
    comparison = pr.merge(
        tb_production[tb_production["country"] == "World"].reset_index(drop=True),
        tb_consumption[tb_consumption["country"] == "World"].reset_index(drop=True),
        how="inner",
        on="year",
    )
    assert (comparison["production_emissions"] == comparison["consumption_emissions"]).all(), error

    # Check that production emissions for the World coincide with global (historical) emissions (on common years).
    comparison = pr.merge(
        tb_production[tb_production["country"] == "World"][["year", "production_emissions"]].reset_index(drop=True),
        tb_historical[["year", "global_fossil_emissions"]],
        how="inner",
        on="year",
    )
    error = "Production emissions for the world were expected to coincide with global fossil emissions."
    # The relative difference (in percent) must be below 0.0001%.
    assert (
        100
        * abs(comparison["production_emissions"] - comparison["global_fossil_emissions"])
        / (comparison["global_fossil_emissions"])
        < 0.0001
    ).all(), error

    # In the Fossil CO2 file, international transport emissions has been separated into aviation and shipping.
    # Emissions are also separated by fuel.
    # Here, total aviation and shipping emissions are added up, to compare them with bunker emissions.
    # NOTE(review): dropna() without a subset drops rows with a missing value in ANY column (not only
    # "emissions_total") - confirm that this is intended.
    global_aviation_and_shipping = (
        tb_co2[tb_co2["country"].isin(["International Aviation", "International Shipping"])]
        .dropna()
        .pivot(index="year", columns="country", values="emissions_total")
        .reset_index()
    )
    global_aviation_and_shipping["global_aviation_and_shipping"] = (
        global_aviation_and_shipping["International Aviation"] + global_aviation_and_shipping["International Shipping"]
    )
    comparison = (
        tb_production[tb_production["country"] == "Bunkers"]
        .reset_index(drop=True)
        .rename(columns={"production_emissions": "global_bunker_emissions"})
        .merge(
            global_aviation_and_shipping[["year", "global_aviation_and_shipping"]],
            how="outer",
            on="year",
        )
        .sort_values("year")
        .reset_index(drop=True)
    )
    # Keep only rows where both time series are informed.
    comparison = comparison.dropna(
        subset=["global_bunker_emissions", "global_aviation_and_shipping"], how="any"
    ).reset_index(drop=True)
    error = (
        "Bunker emissions from national emissions file should coincide (within 0.0001%) with the sum of aviation"
        " and shipping emissions from the Fossil CO2 file."
    )
    assert (
        100
        * abs(comparison["global_bunker_emissions"] - comparison["global_aviation_and_shipping"])
        / (comparison["global_bunker_emissions"])
        < 0.0001
    ).all(), error

    # Now check that all other emissions (that are not from bunker fuels) in tb_production (emissions from the national
    # excel file) coincide with emissions in tb_co2 (from the Fossil CO2 emissions csv file).
    # Since country names have not yet been harmonized, only countries that happen to have the same name in both
    # files are compared (inner merge); "World" is renamed to "Global" so that it is compared too.
    comparison = pr.merge(
        tb_co2[["country", "year", "emissions_total"]],
        tb_production[tb_production["country"] != "Bunkers"].astype({"country": str}).replace({"World": "Global"}),
        on=["country", "year"],
        how="inner",
    ).dropna(subset=["emissions_total", "production_emissions"], how="any")
    # Since we included the emissions from the Kuwaiti oil fires in Kuwait (and they are not included in tb_production),
    # omit that row in the comparison.
    comparison = comparison.drop(
        comparison[(comparison["country"] == "Kuwait") & (comparison["year"] == 1991)].index
    ).reset_index(drop=True)
    # Check that production emissions from national file coincide with the Fossil CO2 emissions dataset.
    # NOTE: It seems that total emissions may have been rounded to zero decimals, which is why in the following
    # assertion I also round production emissions.
    # Rows where the relative difference is nan (e.g. 0/0) are accepted, given that nan is filled with zero.
    error = "Production emissions from national file were expected to coincide with the Fossil CO2 emissions dataset."
    assert (
        (
            100
            * abs(comparison["production_emissions"].round(0) - comparison["emissions_total"])
            / (comparison["emissions_total"])
        ).fillna(0)
        < 0.01
    ).all(), error
def sanity_checks_on_output_data(tb_combined: Table) -> None:
    """Run sanity checks on output data.

    These checks should be run on the very final output table (with an index) prior to storing it as a table.
    The index is reset on a local copy, so the given table is not modified.

    The checks are, in order:
    1. All variables (except traded emissions, growth, and land-use change) are >= 0 or nan.
    2. For the World, production emissions, consumption emissions and population are 100% of the global total.
    3. No share of global is larger than 100% (within a small tolerance).
    4. Cumulative variables (not given as a share of global, and not including land-use change) never decrease.
    5. For the World, all shares of global (other than consumption-based ones) are larger than 99.9999%.
    6. For the World, traded emissions are below 0.0001% of total emissions.

    Parameters
    ----------
    tb_combined : Table
        Combination of all input tables, after processing, harmonization, and addition of variables.

    Raises
    ------
    AssertionError
        If any of the checks fails.

    """
    tb_combined = tb_combined.reset_index()
    error = "All variables (except traded emissions, growth, and land-use change) should be >= 0 or nan."
    positive_variables = [
        col
        for col in tb_combined.columns
        if col != "country"
        if "traded" not in col
        if "growth" not in col
        if "land_use" not in col
    ]
    assert (tb_combined[positive_variables].fillna(0) >= 0).all().all(), error

    error = "Production emissions as a share of global emissions should be 100% for 'World'."
    assert tb_combined[
        (tb_combined["country"] == "World") & (abs(tb_combined["emissions_total_as_share_of_global"] - 100) > 0.00001)
    ].empty, error

    error = "Consumption emissions as a share of global emissions should be 100% for 'World'."
    assert tb_combined[
        (tb_combined["country"] == "World")
        & (abs(tb_combined["consumption_emissions_as_share_of_global"] - 100) > 0.0001)
    ].empty, error

    # Missing population shares are accepted (they are filled with 100 before comparing).
    error = "Population as a share of global population should be 100% for 'World'."
    assert tb_combined[
        (tb_combined["country"] == "World") & (tb_combined["population_as_share_of_global"].fillna(100) != 100)
    ].empty, error

    error = "All share of global emissions should be smaller than 100%."
    share_variables = [
        col
        for col in tb_combined.columns
        if "share" in col
        if col != "emissions_from_land_use_change_as_share_of_global"
    ]
    assert (tb_combined[share_variables].fillna(0) <= 100.001).all().all(), error
    # NOTE: In previous versions, "emissions_from_land_use_change_as_share_of_global" was >101%, e.g. from
    # Upper-middle-income countries in 1982, 1984 and 1986. This is, in principle, possible (since land use change
    # emissions can be negative).
    assert (tb_combined["emissions_from_land_use_change_as_share_of_global"].fillna(0) <= 100.00001).all(), error

    # Check that cumulative variables are monotonically increasing.
    # Firstly, list columns of cumulative variables, but ignoring cumulative columns as a share of global
    # (since they are not necessarily monotonic) and land-use change (which can be negative).
    cumulative_cols = [
        col for col in tb_combined.columns if "cumulative" in col if "share" not in col if "land_use" not in col
    ]
    # Using ".is_monotonic_increasing" can fail when differences between consecutive numbers are very small.
    # Instead, sort data backwards in time, and check that consecutive values of cumulative variables always have
    # a percentage change that is smaller than a certain amount.
    error = (
        "Cumulative variables (not given as a share of global) should be monotonically increasing (except when "
        "including land-use change emissions, which can be negative)."
    )
    assert (
        tb_combined.sort_values("year", ascending=False)
        .groupby("country")
        .agg(
            {
                col: lambda x: ((x.pct_change(fill_method=None).dropna() * 100) <= 0.0001).all()
                for col in cumulative_cols
            }
        )
        .all()
        .all()
    ), error

    # The message below states the threshold that is actually enforced (99.9999%) and the columns that are actually
    # checked (every share column whose name does not contain "consumption").
    error = (
        "Shares of global totals (other than consumption-based ones) should be 100% for the World "
        "(larger than 99.9999%, allowing for small numerical discrepancies)."
    )
    # Consumption emissions as a share of global production emissions is allowed to be smaller than 100%.
    share_variables = [col for col in tb_combined.columns if "share" in col if "consumption" not in col]
    assert (tb_combined[tb_combined["country"] == "World"][share_variables].fillna(100) > 99.9999).all().all(), error

    error = "Traded emissions for the World should be close to zero."
    world_mask = tb_combined["country"] == "World"
    assert (
        abs(
            100
            * tb_combined[world_mask]["traded_emissions"].fillna(0)
            / tb_combined[world_mask]["emissions_total"].fillna(1)
        )
        < 0.0001
    ).all(), error
def prepare_fossil_co2_emissions(tb_co2: Table) -> Table:
    """Prepare Fossil CO2 emissions data (basic processing).

    The steps are, in order:
    1. Select and rename columns (as defined in CO2_COLUMNS), and convert emissions to tonnes of CO2.
    2. Fill the missing "Global" emissions from other industry with the sum over the rows that are informed.
    3. Add the emissions of "Kuwaiti Oil Fires" (1991) to the emissions of Kuwait.
    4. Check that total emissions agree (within 1%) with the sum of the individual sources.
    5. Make total emissions nan on rows where all individual sources are nan.

    Parameters
    ----------
    tb_co2 : Table
        Fossil CO2 emissions data, as read from the meadow dataset.

    Returns
    -------
    tb_co2 : Table
        Fossil CO2 emissions after basic processing (country names are not yet harmonized).

    Raises
    ------
    AssertionError
        If any of the expected issues in the original data is no longer found, or if total emissions do not agree
        with the sum of the individual sources.

    """
    # Select and rename columns from fossil CO2 data.
    tb_co2 = tb_co2[list(CO2_COLUMNS)].rename(columns=CO2_COLUMNS, errors="raise")

    # Ensure all emissions are given in tonnes of CO2.
    tb_co2[EMISSION_SOURCES] *= MILLION_TONNES_OF_CO2_TO_TONNES_OF_CO2

    ####################################################################################################################
    # For certain years, column "emissions_from_other_industry" is not informed for "World" but it is informed
    # for some countries (namely China and US).
    # Note that this is not necessarily an issue in the original data: The data provider may have decided that it is
    # better to leave the world uninformed where not enough countries are informed.
    # However, "emissions_total" for the World seems to include those contributions from China and the US.
    # This can be easily checked in the original data by selecting the year 1989 (last year for which there is data for
    # China and US, but not for the World). The sum of emissions from all sources (namely coal, oil, gas, cement, and
    # flaring, given that "other" is empty) does not add up to "emissions_total". But, if one includes the other
    # emissions from China and US, then it does add up.
    # This inconsistency causes the cumulative emissions from other industry for China and US to be larger than the
    # global cumulative emissions. And the share of global emissions for those countries becomes hence larger than 100%.
    # To fix this issue, we aggregate the data for China and US on those years when the world's data is missing (without
    # touching other years or other columns), and add that data to the global emissions from other industry.
    # NOTE: This issue has been reported to the data providers, and will hopefully be fixed in a coming version.

    # Firstly, list of years for which the world has no data for emissions_from_other_industry.
    # NOTE: At this point (before harmonization) the world is still called "Global" in this table.
    world_missing_years = (
        tb_co2[(tb_co2["country"] == "Global") & (tb_co2["emissions_from_other_industry"].isnull())]["year"]
        .unique()
        .tolist()  # type: ignore
    )
    # Data that needs to be aggregated (any row that is informed on those years, whatever its country).
    data_missing_in_world = tb_co2[
        tb_co2["year"].isin(world_missing_years) & (tb_co2["emissions_from_other_industry"].notnull())
    ]
    # Check that there is indeed data to be aggregated (that is missing for the World).
    error = (
        "Expected emissions_from_other_industry to be null for the world but not null for certain countries "
        "(which was an issue in the original fossil CO2 data). The issue may be fixed and the code can be simplified."
    )
    assert len(data_missing_in_world) > 0, error
    # Create a table of aggregate data for the World, on those years when it's missing.
    aggregated_missing_data = (
        data_missing_in_world.groupby("year")
        .agg({"emissions_from_other_industry": "sum"})
        .reset_index()
        .assign(**{"country": "Global"})
    )
    # Combine the new table of aggregate data with the main table.
    tb_co2 = dataframes.combine_two_overlapping_dataframes(
        df1=tb_co2, df2=aggregated_missing_data, index_columns=["country", "year"], keep_column_order=True
    )
    # NOTE: The previous function currently does not properly propagate metadata, but keeps only the sources of the
    # first table. But given that both tables combined have the same source, we don't need to manually change it.
    ####################################################################################################################

    # We add the emissions from "Kuwaiti Oil Fires" (which is also included as a separate country) as part of the
    # emissions of Kuwait. This ensures that they will be included in region aggregates.
    error = "'Kuwaiti Oil Fires' was expected to only have not-null data for 1991."
    assert tb_co2[
        (tb_co2["country"] == "Kuwaiti Oil Fires")
        & (tb_co2["emissions_total"].notnull())
        & (tb_co2["emissions_total"] != 0)
    ]["year"].tolist() == [1991], error

    # NOTE(review): The raw arrays are added with a plain "+", so a nan in either row makes the result nan for that
    # source - confirm that neither row has missing sources in 1991.
    tb_co2.loc[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991), EMISSION_SOURCES] = (
        tb_co2[(tb_co2["country"] == "Kuwaiti Oil Fires") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values
        + tb_co2[(tb_co2["country"] == "Kuwait") & (tb_co2["year"] == 1991)][EMISSION_SOURCES].values
    )

    # Check that "emissions_total" agrees with the sum of emissions from individual sources.
    # The small constant added to the denominator avoids dividing by zero on rows where total emissions are zero
    # or nan.
    error = "The sum of all emissions should add up to total emissions (within 1%)."
    assert (
        abs(
            tb_co2.drop(columns=["country", "year", "emissions_total"]).sum(axis=1)
            - tb_co2["emissions_total"].fillna(0)
        )
        / (tb_co2["emissions_total"].fillna(0) + 1e-7)
        < 1e-2
    ).all(), error

    # Many rows have zero total emissions, but actually the individual sources are nan.
    # Total emissions in those cases should be nan, instead of zero.
    no_individual_emissions = tb_co2.drop(columns=["country", "year", "emissions_total"]).isnull().all(axis=1)
    tb_co2.loc[no_individual_emissions, "emissions_total"] = np.nan

    return tb_co2
def prepare_consumption_emissions(tb_consumption: Table) -> Table:
    """Prepare consumption-based emissions data (basic processing).

    Columns are selected and renamed (as defined in CONSUMPTION_EMISSIONS_COLUMNS), emissions are converted to
    tonnes of CO2, and the known outliers (listed in OUTLIERS_IN_CONSUMPTION_DF) are removed.

    Raises
    ------
    AssertionError
        If any of the known outliers no longer has negative consumption emissions.

    """
    tb = tb_consumption[list(CONSUMPTION_EMISSIONS_COLUMNS)].rename(
        columns=CONSUMPTION_EMISSIONS_COLUMNS, errors="raise"
    )

    # The original data is given in megatonnes of carbon per year; convert it to tonnes of CO2 per year.
    value_columns = tb.drop(columns=["country", "year"]).columns
    for value_column in value_columns:
        tb[value_column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2

    # Find the index of the (single) row corresponding to each of the known outliers.
    outlier_rows = []
    for outlier_country, outlier_year in OUTLIERS_IN_CONSUMPTION_DF:
        is_outlier = (tb["country"] == outlier_country) & (tb["year"] == outlier_year)
        outlier_rows.append(tb[is_outlier].index.item())

    error = (
        "Outliers were expected to have negative consumption emissions. "
        "Maybe outliers have been fixed (and should be removed from the code)."
    )
    assert (tb.loc[outlier_rows]["consumption_emissions"] < 0).all(), error

    # Return the table without the outliers.
    return tb.drop(outlier_rows).reset_index(drop=True)
def prepare_production_emissions(tb_production: Table) -> Table:
    """Prepare production-based emissions data (basic processing).

    All columns other than "country" and "year" are converted from megatonnes of carbon per year to tonnes of CO2
    per year.

    NOTE: The conversion is done in place, so the given table is modified (and returned).

    """
    emission_columns = tb_production.drop(columns=["country", "year"]).columns
    for emission_column in emission_columns:
        tb_production[emission_column] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2

    return tb_production


def prepare_land_use_emissions(tb_land_use: Table) -> Table:
    """Prepare land-use change emissions data (basic processing).

    Emissions are converted from megatonnes of carbon per year to tonnes of CO2 per year, and the rows for "Global"
    and "EU27" are removed.

    NOTE: The conversion is done in place on the given table; the returned table is a filtered version of it.

    """
    tb_land_use["emissions"] *= MILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2

    # There are two additional regions in the land-use change file, namely Global and EU27.
    # National land-use change contributions are taken from one of the sheets of that file (currently the "BLUE"
    # sheet), since there are no other national land-use change emissions in other files.
    # But for global emissions, it makes more sense to take the ones estimated by GCP, which are given in the
    # "Historical Budget" sheet of the global emissions file. Hence the data for "Global" is removed.
    # The data for "EU27" is also removed, since that region is aggregated from its member countries.
    is_gcp_aggregate = tb_land_use["country"].isin(["Global", "EU27"])

    return tb_land_use[~is_gcp_aggregate].reset_index(drop=True)
def prepare_historical_emissions(tb_historical: Table) -> Table:
    """Prepare historical emissions data.

    Columns are selected and renamed (as defined in HISTORICAL_EMISSIONS_COLUMNS), and all columns other than
    "country" and "year" are converted from gigatonnes of carbon per year to tonnes of CO2 per year.

    """
    selected_columns = list(HISTORICAL_EMISSIONS_COLUMNS)
    tb = tb_historical[selected_columns].rename(columns=HISTORICAL_EMISSIONS_COLUMNS, errors="raise")

    # Convert units of all emission columns.
    emission_columns = tb.drop(columns=["country", "year"]).columns
    for emission_column in emission_columns:
        tb[emission_column] *= BILLION_TONNES_OF_CARBON_TO_TONNES_OF_CO2

    return tb
def extract_global_emissions(tb_co2: Table, tb_historical: Table, ds_population: Dataset) -> Table:
    """Extract World emissions by combining data from the Fossil CO2 emissions and the global emissions dataset.

    The resulting global emissions data includes bunker and land-use change emissions.

    NOTE: This function has to be used after selecting and renaming columns in tb_co2, but before harmonizing country
    names in tb_co2 (so that "International Aviation" and "International Shipping" are still listed as countries).

    Parameters
    ----------
    tb_co2 : Table
        Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file).
    tb_historical : Table
        Historical emissions from GCP's official global emissions dataset (excel file).
    ds_population : Dataset
        Population dataset.

    Returns
    -------
    global_emissions : Table
        World emissions.

    Raises
    ------
    AssertionError
        If total emissions from international aviation do not coincide with its emissions from oil.

    """

    def _emissions_by_year(entity: str) -> Table:
        # Emissions of one of the entities that are listed as countries, indexed by year.
        return tb_co2[tb_co2["country"] == entity].set_index(["year"]).drop(columns=["country"])

    # "International Aviation" and "International Shipping" are included as separate countries.
    # Add up their emissions to get emissions from international transport.
    aviation = _emissions_by_year("International Aviation")
    shipping = _emissions_by_year("International Shipping")
    transport = aviation + shipping

    # Total emissions from international aviation are expected to come only from oil.
    # NOTE: International shipping does include emissions from gas, coal and oil.
    error = "Total emissions from international aviation do not coincide with oil emissions."
    aviation_non_oil = (aviation["emissions_from_oil"] - aviation["emissions_total"]).dropna()
    assert all(aviation_non_oil == 0), error

    # From international transport, only total emissions are kept.
    transport = transport[["emissions_total"]].rename(
        columns={"emissions_total": "global_emissions_from_international_transport"}, errors="raise"
    )
    transport = transport.dropna().reset_index()

    # Start the table of global emissions with the rows of the world in the fossil CO2 data.
    is_world = tb_co2["country"].isin(["Global", "World"])
    global_names = {source: f"global_{source}" for source in EMISSION_SOURCES}
    global_emissions = tb_co2[is_world][["year"] + EMISSION_SOURCES].rename(columns=global_names, errors="raise")
    global_emissions = global_emissions.sort_values("year").reset_index(drop=True)

    # Add bunker fuels (international transport), and then historical land-use change emissions.
    global_emissions = pr.merge(global_emissions, transport, on=["year"], how="outer")
    land_use_change = tb_historical[["year", "global_emissions_from_land_use_change"]]
    global_emissions = pr.merge(global_emissions, land_use_change, how="left", on="year")

    # Total emissions including both fossil fuels and land-use change.
    global_emissions["global_emissions_total_including_land_use_change"] = (
        global_emissions["global_emissions_total"] + global_emissions["global_emissions_from_land_use_change"]
    )

    # Cumulative version of each of the global variables.
    variables_to_accumulate = EMISSION_SOURCES + [
        "emissions_from_land_use_change",
        "emissions_total_including_land_use_change",
    ]
    for variable in variables_to_accumulate:
        global_emissions[f"global_cumulative_{variable}"] = global_emissions[f"global_{variable}"].cumsum()

    # Add a country column, which is needed to be able to add the global population.
    global_emissions["country"] = "World"
    global_emissions = geo.add_population_to_table(
        tb=global_emissions, ds_population=ds_population, population_col="global_population"
    )

    return global_emissions
def harmonize_country_names(tb: Table) -> Table:
    """Harmonize country names of an emissions table.

    Country names are mapped using the countries file of this step, and the countries listed in the excluded
    countries file of this step are handled by ``geo.harmonize_countries``. A warning is issued only for countries
    that are missing in the mapping.

    Parameters
    ----------
    tb : Table
        Emissions data (either from the fossil CO2, the production-based, consumption-based, or land-use emissions
        datasets).

    Returns
    -------
    Table
        Emissions data after harmonizing country names.

    """
    return geo.harmonize_countries(
        df=tb,
        countries_file=paths.country_mapping_path,
        excluded_countries_file=paths.excluded_countries_path,
        warn_on_missing_countries=True,
        warn_on_unused_countries=False,
        make_missing_countries_nan=False,
        warn_on_unknown_excluded_countries=False,
    )
def fix_duplicated_palau_data(tb_co2: Table) -> Table:
    """Remove the duplicated country-year rows of Palau that are created by the harmonization of country names.

    In the fossil CO2 emissions data, after harmonization, "Pacific Islands (Palau)" is mapped to "Palau", and
    therefore there are rows with different data for the same country-year. However, "Pacific Islands (Palau)" has
    data until 1991, and "Palau" has data from 1992 onwards.
    NOTE: This is not an issue with the original data, and it's simply caused by our harmonization of names.

    For years before 1992, the first row of each duplicated country-year is kept; from 1992 onwards, the last one is
    kept. This assumes that rows originally labelled "Pacific Islands (Palau)" come before those originally labelled
    "Palau" (to be confirmed against the harmonized data).

    The given table is not modified.

    Raises
    ------
    AssertionError
        If "Palau" is not the only country with duplicated country-year rows.

    """
    tb = tb_co2.copy()
    key_columns = ["country", "year"]

    # Check that duplicate rows are still there (and only for Palau).
    error = "Expected 'Palau' data to be duplicated. Remove temporary fix."
    countries_with_duplicates = tb[tb.duplicated(subset=key_columns)]["country"].unique().tolist()
    assert countries_with_duplicates == ["Palau"], error

    is_palau = tb["country"] == "Palau"
    # Rows that repeat a country-year that already appeared in a previous row.
    is_later_copy = tb.duplicated(subset=key_columns, keep="first")
    # Rows whose country-year appears again in a following row.
    is_earlier_copy = tb.duplicated(subset=key_columns, keep="last")

    # Before 1992, drop the later copy; from 1992 onwards, drop the earlier copy.
    dropped_before_1992 = tb[is_palau & (tb["year"] < 1992) & is_later_copy].index.tolist()
    dropped_from_1992 = tb[is_palau & (tb["year"] >= 1992) & is_earlier_copy].index.tolist()
    indexes_to_drop = dropped_before_1992 + dropped_from_1992

    # Check that the selected rows do not overlap.
    assert len(indexes_to_drop) == len(set(indexes_to_drop))

    # NOTE: Do not drop empty rows yet, as they will be needed to have a complete population series.
    return tb.drop(indexes_to_drop).reset_index(drop=True)
+ tb = tb_co2_with_regions.copy() + consumption_emissions_africa = tb[(tb["country"] == "Africa") & (tb["year"] == 2020)][ + "consumption_emissions" + ].item() + consumption_emissions_africa_gcp = tb[(tb["country"] == "Africa (GCP)") & (tb["year"] == 2020)][ + "consumption_emissions" + ].item() + error = ( + "Discrepancy in consumption emissions between aggregated Africa and Africa (GCP) no longer exists. " + "Remove temporary fix" + ) + assert ( + consumption_emissions_africa_gcp - consumption_emissions_africa + ) / consumption_emissions_africa_gcp > 0.23, error + + # Replace consumption emissions for "Africa" by those by "Africa (GCP)". + consumption_emissions = tb[tb["country"] != "Africa"][["country", "year", "consumption_emissions"]].reset_index( + drop=True + ) + consumption_emissions_for_africa = ( + consumption_emissions[consumption_emissions["country"] == "Africa (GCP)"] + .reset_index(drop=True) + .replace({"Africa (GCP)": "Africa"}) + ) + consumption_emissions = pr.concat([consumption_emissions, consumption_emissions_for_africa], ignore_index=True) + # Replace consumption emissions in main table by the fixed one. + tb = tb.drop(columns="consumption_emissions").merge(consumption_emissions, on=["country", "year"], how="outer") + + # Sanity checks. + # All columns except consumption_emissions should be identical to the original. + error = "Mismatch before and after fixing consumption emissions for Africa." + for col in tb.drop(columns=["consumption_emissions"]).columns: + assert ( + tb[col].dropna().reset_index(drop=True) == tb_co2_with_regions[col].dropna().reset_index(drop=True) + ).all() + # Consumption emissions should be identical to the original except for Africa. 
+ assert ( + tb[tb["country"] != "Africa"]["consumption_emissions"].dropna().reset_index(drop=True) + == tb_co2_with_regions[tb_co2_with_regions["country"] != "Africa"]["consumption_emissions"] + .dropna() + .reset_index(drop=True) + ).all() + + return tb + + +def combine_data_and_add_variables( + tb_co2: Table, + tb_production: Table, + tb_consumption: Table, + tb_global_emissions: Table, + tb_land_use: Table, + tb_energy: Table, + ds_gdp: Dataset, + ds_population: Table, + ds_regions: Dataset, + ds_income_groups: Dataset, +) -> Table: + """Combine all relevant data into one table, add region aggregates, and add custom variables (e.g. emissions per + capita). + + Parameters + ---------- + tb_co2 : Table + Production-based emissions from GCP's Fossil CO2 emissions dataset (csv file), after harmonization. + tb_production : Table + Production-based emissions from GCP's official national emissions dataset (excel file), after harmonization. + tb_consumption : Table + Consumption-based emissions from GCP's official national emissions dataset (excel file), after harmonization. + tb_global_emissions : Table + World emissions (including bunker and land-use change emissions). + tb_land_use : Table + National land-use change emissions from GCP's official dataset (excel file), after harmonization. + tb_energy : Table + Primary energy data. + ds_gdp : Dataset + GDP dataset. + ds_population : Dataset + Population dataset. + ds_regions : Dataset + Regions dataset. + ds_income_groups : Dataset + Income groups dataset. + + Returns + ------- + tb_co2_with_regions : Table + Combined data, with all additional variables and with region aggregates. + + """ + tb_co2_with_regions = tb_co2.copy() + + # Add region aggregates that were included in the national emissions file, but not in the Fossil CO2 emissions file. + gcp_aggregates = sorted(set(tb_production["country"]) - set(tb_co2_with_regions["country"])) + # NOTE: Here, "International transport" is included. 
This will cause that total emissions have both data for + # international aviation and shipping, and international transport (which is the sum of the former two). + # But international transport will be removed later, in columns when that happens. + tb_co2_with_regions = pr.concat( + [ + tb_co2_with_regions, + tb_production[tb_production["country"].isin(gcp_aggregates)] + .rename(columns={"production_emissions": "emissions_total"}) + .astype({"year": int}), + ], + ignore_index=True, + short_name=paths.short_name, + ).reset_index(drop=True) + + # Add consumption emissions to main table (keep only the countries of the main table). + # Given that additional GCP regions (e.g. "Africa (GCP)") have already been added to tb_co2 + # (when merging with tb_production), all countries from tb_consumption should be included in tb_co2. + error = "Some countries in tb_consumption are not included in tb_co2." + assert set(tb_consumption["country"]) < set(tb_co2_with_regions["country"]), error + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_consumption, on=["country", "year"], how="outer") + + # Add population to original table. + tb_co2_with_regions = geo.add_population_to_table( + tb=tb_co2_with_regions, ds_population=ds_population, warn_on_missing_countries=False + ) + + # Add GDP to main table. + tb_co2_with_regions = geo.add_gdp_to_table(tb=tb_co2_with_regions, ds_gdp=ds_gdp) + + # Add primary energy to main table. + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_energy, on=["country", "year"], how="left") + + # For convenience, rename columns in land-use change emissions data. + tb_land_use = tb_land_use.rename(columns={"emissions": "emissions_from_land_use_change"}) + + # Land-use change data does not include data for the World. Include it by merging with the global dataset. 
+ tb_land_use = pr.concat( + [ + tb_land_use, + tb_global_emissions.rename( + columns={"global_emissions_from_land_use_change": "emissions_from_land_use_change"} + )[["year", "emissions_from_land_use_change"]] + .dropna() + .assign(**{"country": "World"}), + ], + ignore_index=True, + ).astype({"year": int}) + + # Add land-use change emissions to main table. + tb_co2_with_regions = pr.merge(tb_co2_with_regions, tb_land_use, on=["country", "year"], how="outer") + + # Add total emissions (including land-use change) for each country. + tb_co2_with_regions["emissions_total_including_land_use_change"] = ( + tb_co2_with_regions["emissions_total"] + tb_co2_with_regions["emissions_from_land_use_change"] + ) + + # Add region aggregates. + # Aggregate not only emissions data, but also population, gdp and primary energy. + # This way we ensure that custom regions (e.g. "North America (excl. USA)") will have all required data. + aggregations = {column: "sum" for column in tb_co2_with_regions.columns if column not in ["country", "year"]} + for region in REGIONS: + countries_in_region = geo.list_members_of_region( + region=region, + ds_regions=ds_regions, + ds_income_groups=ds_income_groups, + additional_regions=REGIONS[region].get("additional_regions", None), + excluded_regions=REGIONS[region].get("excluded_regions", None), + additional_members=REGIONS[region].get("additional_members", None), + excluded_members=REGIONS[region].get("excluded_members", None), + include_historical_regions_in_income_groups=True, + ) + tb_co2_with_regions = geo.add_region_aggregates( + df=tb_co2_with_regions, + region=region, + countries_in_region=countries_in_region, + countries_that_must_have_data=[], + frac_allowed_nans_per_year=0.999, + aggregations=aggregations, + ) + + # Fix consumption emissions for Africa. 
+ tb_co2_with_regions = fix_consumption_emissions_for_africa(tb_co2_with_regions=tb_co2_with_regions) + + # Temporarily add global emissions and global cumulative emissions columns to main table, to be able to calculate + # indicators in terms of global emissions. + tb_co2_with_regions = pr.merge( + tb_co2_with_regions, tb_global_emissions.drop(columns="country"), on=["year"], how="left" + ) + + # Temporarily add certain global emissions variables. + # This is done simply to be able to consider "consumption_emissions" as just another type of emission + # when creating additional variables. + tb_co2_with_regions["global_consumption_emissions"] = tb_co2_with_regions["global_emissions_total"].copy() + tb_co2_with_regions["global_cumulative_consumption_emissions"] = tb_co2_with_regions[ + "global_cumulative_emissions_total" + ].copy() + + # Ensure main table is sorted (so that cumulative emissions are properly calculated). + tb_co2_with_regions = tb_co2_with_regions.sort_values(["country", "year"]).reset_index(drop=True) + + # Add new variables for each source of emissions. + for column in EMISSION_SOURCES + [ + "consumption_emissions", + "emissions_from_land_use_change", + "emissions_total_including_land_use_change", + ]: + # Add per-capita variables. + tb_co2_with_regions[f"{column}_per_capita"] = tb_co2_with_regions[column] / tb_co2_with_regions["population"] + + # Add columns for cumulative emissions. + # Rows that had nan emissions will have nan cumulative emissions. + # But nans will not be propagated in the sum. + # This means that countries with some (not all) nans will have the cumulative sum of the informed emissions + # (treating nans as zeros), but will have nan on those rows that were not informed. + tb_co2_with_regions[f"cumulative_{column}"] = tb_co2_with_regions.groupby(["country"])[column].cumsum() + + # Add share of global emissions. 
+ tb_co2_with_regions[f"{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[column] / tb_co2_with_regions[f"global_{column}"] + ) + + # Add share of global cumulative emissions. + tb_co2_with_regions[f"cumulative_{column}_as_share_of_global"] = ( + 100 * tb_co2_with_regions[f"cumulative_{column}"] / tb_co2_with_regions[f"global_cumulative_{column}"] + ) + + # Add total emissions per unit energy (in kg of emissions per kWh). + tb_co2_with_regions["emissions_total_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions (including land-use change) per unit energy (in kg of emissions per kWh). + tb_co2_with_regions["emissions_total_including_land_use_change_per_unit_energy"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / (tb_co2_with_regions["primary_energy_consumption"] * TWH_TO_KWH) + ) + + # Add total emissions per unit GDP. + tb_co2_with_regions["emissions_total_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["emissions_total"] / tb_co2_with_regions["gdp"] + ) + + # Add total emissions (including land-use change) per unit GDP. + tb_co2_with_regions["emissions_total_including_land_use_change_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 + * tb_co2_with_regions["emissions_total_including_land_use_change"] + / tb_co2_with_regions["gdp"] + ) + + # Add total consumption emissions per unit GDP. + tb_co2_with_regions["consumption_emissions_per_gdp"] = ( + TONNES_OF_CO2_TO_KG_OF_CO2 * tb_co2_with_regions["consumption_emissions"] / tb_co2_with_regions["gdp"] + ) + + # Add variable of emissions embedded in trade. 
+ tb_co2_with_regions["traded_emissions"] = ( + tb_co2_with_regions["consumption_emissions"] - tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["pct_traded_emissions"] = ( + 100 * tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["emissions_total"] + ) + tb_co2_with_regions["traded_emissions_per_capita"] = ( + tb_co2_with_regions["traded_emissions"] / tb_co2_with_regions["population"] + ) + + # Add annual percentage growth of total emissions. + tb_co2_with_regions["pct_growth_emissions_total"] = ( + tb_co2_with_regions.groupby("country", observed=True)["emissions_total"].pct_change(fill_method=None) * 100 + ) + + # Add annual percentage growth of total emissions (including land-use change). + tb_co2_with_regions["pct_growth_emissions_total_including_land_use_change"] = ( + tb_co2_with_regions.groupby("country")["emissions_total_including_land_use_change"].pct_change(fill_method=None) + * 100 + ) + + # Add annual absolute growth of total emissions. + tb_co2_with_regions["growth_emissions_total"] = tb_co2_with_regions.groupby("country")["emissions_total"].diff() + + # Add annual absolute growth of total emissions (including land-use change). + tb_co2_with_regions["growth_emissions_total_including_land_use_change"] = tb_co2_with_regions.groupby("country")[ + "emissions_total_including_land_use_change" + ].diff() + + # Create variable of population as a share of global population. + tb_co2_with_regions["population_as_share_of_global"] = ( + tb_co2_with_regions["population"] / tb_co2_with_regions["global_population"] * 100 + ) + + # Remove temporary columns of global emissions. + tb_co2_with_regions = tb_co2_with_regions.drop( + columns=[column for column in tb_co2_with_regions.columns if column.startswith("global_")] + ) + + # Empty rows of international transport if international aviation and shipping are already informed. + # First find the list of columns where this happens. 
+ international_entities = [entity for entity in set(tb_co2_with_regions["country"]) if "International" in entity] + check = tb_co2_with_regions[tb_co2_with_regions["country"].isin(international_entities)].reset_index(drop=True) + # Check that the only columns where international transport, aviation and shipping are all informed are columns + # derived from total emissions. + columns_with_redundant_international_emissions = [ + column + for column in check.drop(columns=["country", "year"]).columns + if set(check.dropna(subset=column)["country"]) == set(international_entities) + ] + error = ( + "Unexpected columns where international transport is informed as well as international aviation and shipping." + ) + assert all(["emissions_total" in column for column in columns_with_redundant_international_emissions]), error + # Now for those columns, make international transport nan. + for column in columns_with_redundant_international_emissions: + tb_co2_with_regions.loc[tb_co2_with_regions["country"] == "International transport", column] = np.nan + + # Replace infinity values (for example when calculating growth from zero to non-zero) in the data by nan. + for column in tb_co2_with_regions.drop(columns=["country", "year"]).columns: + tb_co2_with_regions.loc[np.isinf(tb_co2_with_regions[column]), column] = np.nan + + # For special GCP countries/regions (e.g. "Europe (GCP)") we should keep only the original data. + # Therefore, make nan all additional variables for those countries/regions, and keep only GCP's original data. + added_variables = tb_co2_with_regions.drop( + columns=["country", "year"] + COLUMNS_THAT_MUST_HAVE_DATA + ).columns.tolist() + tb_co2_with_regions.loc[ + (tb_co2_with_regions["country"].str.contains(" (GCP)", regex=False)), added_variables + ] = np.nan + + # Remove uninformative rows (those that have only data for, say, gdp, but not for variables related to emissions). 
+ tb_co2_with_regions = tb_co2_with_regions.dropna(subset=COLUMNS_THAT_MUST_HAVE_DATA, how="all").reset_index( + drop=True + ) + + # Ensure that there are no rows that only have nan values. + tb_co2_with_regions = tb_co2_with_regions.dropna( + subset=tb_co2_with_regions.drop(columns=["country", "year"]).columns, how="all" + ) + + return tb_co2_with_regions diff --git a/etl/steps/data/grapher/gcp/2024-11-21/global_carbon_budget.py b/etl/steps/data/grapher/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..0eafb09f363 --- /dev/null +++ b/etl/steps/data/grapher/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,44 @@ +"""Load a garden dataset and create a grapher dataset. + +Some auxiliary variables will be added (where nans are filled with zeros, to avoid missing data in stacked area charts). + +""" + +import numpy as np +import pandas as pd + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset and read its main table. + ds_garden = paths.load_dataset("global_carbon_budget") + tb_garden = ds_garden["global_carbon_budget"] + + # + # Process data. + # + # Ensure all countries span all years (from 1750 to the latest observation), even if many of those rows are empty. + # This will increase the size of the dataset, but we do this so that stacked area charts span the maximum possible + # range of years. + countries = tb_garden.reset_index()["country"].unique() + years = np.arange(tb_garden.reset_index()["year"].min(), tb_garden.reset_index()["year"].max() + 1, dtype=int) + tb_garden = tb_garden.reindex(pd.MultiIndex.from_product([countries, years], names=["country", "year"])) + + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. 
+ ds_grapher = create_dataset( + dest_dir, + tables=[tb_garden], + default_metadata=ds_garden.metadata, + check_variables_metadata=True, + ) + ds_grapher.save() diff --git a/etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py b/etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..1289f28ad70 --- /dev/null +++ b/etl/steps/data/meadow/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,212 @@ +"""Load a snapshot and create a meadow dataset. + +It combines the following snapshots: +- GCP's Fossil CO2 emissions (long-format csv). +- GCP's official GCB global emissions (excel file) containing global bunker fuel and land-use change emissions. +- GCP's official GCB national emissions (excel file) containing consumption-based emissions for each country. + - Production-based emissions from this file are also used, but just to include total emissions of regions + according to GCP (e.g. "Africa (GCP)") and for sanity checks. +- GCP's official GCB national land-use change emissions (excel file) with land-use change emissions for each country. + +""" + +from owid.catalog import Table +from structlog import get_logger + +from etl.helpers import PathFinder, create_dataset + +# Initialize logger. +log = get_logger() + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def prepare_fossil_co2(tb_fossil_co2: Table) -> Table: + # Set an appropriate index and sort conveniently. + tb_fossil_co2 = tb_fossil_co2.set_index(["Country", "Year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Ensure all columns are snake-case. + tb_fossil_co2 = tb_fossil_co2.underscore() + + return tb_fossil_co2 + + +def prepare_historical_budget(tb_historical: Table) -> Table: + """Select variables and prepare the historical budget sheet of GCB's raw global data file. + + Parameters + ---------- + tb_historical : Table + Historical budget sheet of GCB's raw global data file. 
+ + Returns + ------- + tb_historical : Table + Historical budget after selecting variables and processing them. + + """ + # Sanity check. + error = "'Historical Budget' sheet in global data file has changed (consider changing 'skiprows')." + assert tb_historical.columns[0] == "Year", error + + # Columns to select in historical budget and how to rename them. + columns = { + "Year": "year", + "fossil emissions excluding carbonation": "global_fossil_emissions", + "land-use change emissions": "global_land_use_change_emissions", + } + tb_historical = tb_historical[list(columns)].rename(columns=columns) + + # Add column for country (to be able to combine this with the national data). + tb_historical["country"] = "World" + + # Set an index and sort row and columns conveniently. + tb_historical = tb_historical.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb_historical.metadata.short_name = "global_carbon_budget_historical_budget" + + return tb_historical + + +def prepare_land_use_emissions(tb_land_use: Table) -> Table: + """Prepare data from a specific sheet of the land-use change data file. + + Parameters + ---------- + tb_land_use : Table + Data from a specific sheet of the land-use change emissions data file. + + Returns + ------- + tb_land_use : Table + Processed land-use change emissions data. + + """ + tb_land_use = tb_land_use.copy() + + # Sanity check. + error = "'BLUE' sheet in national land-use change data file has changed (consider changing 'skiprows')." + assert tb_land_use.columns[1] == "Afghanistan", error + + # Rename year column. + tb_land_use = tb_land_use.rename(columns={tb_land_use.columns[0]: "year"}) + + # Ignore countries that have no data. + tb_land_use = tb_land_use.dropna(axis=1, how="all") + + # Remove rows that are either empty, or have some other additional operation (e.g. 2013-2022). 
+ tb_land_use = tb_land_use[tb_land_use["year"].astype(str).str.match(r"^\d{4}$")].reset_index(drop=True) + + # Restructure data to have a column for country and another for emissions. + tb_land_use = tb_land_use.melt(id_vars="year", var_name="country", value_name="emissions") + + # Set an index and sort row and columns conveniently. + tb_land_use = tb_land_use.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb_land_use.metadata.short_name = "global_carbon_budget_land_use_change" + + return tb_land_use + + +def prepare_national_emissions(tb: Table, column_name: str) -> Table: + """Select variables and prepare the territorial emissions (or the consumption emissions) sheet of GCB's raw national + data file. + + Parameters + ---------- + tb : Table + Territorial emissions (or consumption emissions) sheet of GCB's raw national data file. + column_name : str + Name to assign to emissions column to be generated. + + Returns + ------- + tb_national : Table + Processed territorial (or consumption) emissions sheet of GCB's raw national data file. + + """ + tb = tb.copy() + + error = f"Sheet in national data file for {column_name} has changed (consider changing 'skiprows')." + assert tb.columns[1] == "Afghanistan", error + + # The zeroth column is expected to be year. + tb = tb.rename(columns={tb.columns[0]: "year"}) + + # Each column represents a country; then the final columns are regions, "Bunkers", and "Statistical Difference". + # Keep "Bunkers", but remove "Statistical Difference" (which is almost completely empty). + # In fact "Bunkers" is a global variable (I don't know why it is included at the national level), but this will be + # handled at the garden step. + + # Remove unnecessary column. + tb = tb.drop(columns=["Statistical Difference"]) + + # Convert from wide to long format dataframe. 
+ tb = tb.melt(id_vars=["year"]).rename(columns={"variable": "country", "value": column_name}) + + # Set an index and sort row and columns conveniently. + tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1) + + # Rename table. + tb.metadata.short_name = f"global_carbon_budget_{column_name}" + + return tb + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshots. + snap_fossil_co2 = paths.load_snapshot("global_carbon_budget_fossil_co2_emissions.csv") + snap_global = paths.load_snapshot("global_carbon_budget_global_emissions.xlsx") + snap_national = paths.load_snapshot("global_carbon_budget_national_emissions.xlsx") + snap_land_use = paths.load_snapshot("global_carbon_budget_land_use_change_emissions.xlsx") + + # Load data from fossil CO2 emissions. + tb_fossil_co2 = snap_fossil_co2.read() + + # Load historical budget from the global emissions file. + tb_historical = snap_global.read(sheet_name="Historical Budget", skiprows=15) + + # Load land-use emissions. + tb_land_use = snap_land_use.read(sheet_name="BLUE", skiprows=7) + + # Load production-based national emissions. + tb_production = snap_national.read(sheet_name="Territorial Emissions", skiprows=11) + + # Load consumption-based national emissions. + tb_consumption = snap_national.read(sheet_name="Consumption Emissions", skiprows=8) + + # + # Process data. + # + # Prepare data for fossil CO2 emissions. + tb_fossil_co2 = prepare_fossil_co2(tb_fossil_co2=tb_fossil_co2) + + # Prepare data for historical emissions. + tb_historical = prepare_historical_budget(tb_historical=tb_historical) + + # Prepare data for land-use emissions. + tb_land_use = prepare_land_use_emissions(tb_land_use=tb_land_use) + + # Prepare data for production-based emissions, from the file of national emissions. 
+ tb_production = prepare_national_emissions(tb=tb_production, column_name="production_emissions") + + # Prepare data for consumption-based emissions, from the file of national emissions. + tb_consumption = prepare_national_emissions(tb=tb_consumption, column_name="consumption_emissions") + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset( + dest_dir, + tables=[tb_fossil_co2, tb_historical, tb_land_use, tb_production, tb_consumption], + default_metadata=snap_fossil_co2.metadata, + check_variables_metadata=True, + ) + ds_meadow.save() diff --git a/etl/steps/export/github/co2_data/latest/owid_co2.py b/etl/steps/export/github/co2_data/latest/owid_co2.py index bdc6bc1f75a..bde982f093b 100644 --- a/etl/steps/export/github/co2_data/latest/owid_co2.py +++ b/etl/steps/export/github/co2_data/latest/owid_co2.py @@ -1,6 +1,6 @@ """Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. -Datasets combined: +The combined datasets are: * Global Carbon Budget - Global Carbon Project. * National contributions to climate change - Jones et al. * Greenhouse gas emissions by sector - Climate Watch. @@ -9,353 +9,213 @@ Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on GDP are included. -""" - +Outputs that will be committed to a branch in the co2-data repository: +* The main data file (as a .csv file). +* The codebook (as a .csv file). +* The README file. +""" import os +import tempfile +from pathlib import Path -import numpy as np -from owid.catalog import Dataset, Origin, Table +import pandas as pd +from owid.catalog import Table +from structlog import get_logger from apps.owidbot import github_utils as gh from etl.helpers import PathFinder +# Initialize logger. +log = get_logger() + # Get paths and naming conventions for current step. 
paths = PathFinder(__file__) -# Conversion factor from tonnes to million tonnes. -TONNES_TO_MILLION_TONNES = 1e-6 - -# Select columns to use from each dataset, and how to rename them. -GCP_COLUMNS = { - "country": "country", - "year": "year", - "emissions_total": "co2", - "emissions_total_per_capita": "co2_per_capita", - "traded_emissions": "trade_co2", - "emissions_from_cement": "cement_co2", - "emissions_from_cement_per_capita": "cement_co2_per_capita", - "emissions_from_coal": "coal_co2", - "emissions_from_coal_per_capita": "coal_co2_per_capita", - "emissions_from_flaring": "flaring_co2", - "emissions_from_flaring_per_capita": "flaring_co2_per_capita", - "emissions_from_gas": "gas_co2", - "emissions_from_gas_per_capita": "gas_co2_per_capita", - "emissions_from_oil": "oil_co2", - "emissions_from_oil_per_capita": "oil_co2_per_capita", - "emissions_from_other_industry": "other_industry_co2", - "emissions_from_other_industry_per_capita": "other_co2_per_capita", - "pct_growth_emissions_total": "co2_growth_prct", - "growth_emissions_total": "co2_growth_abs", - "emissions_total_per_gdp": "co2_per_gdp", - "emissions_total_per_unit_energy": "co2_per_unit_energy", - "consumption_emissions": "consumption_co2", - "consumption_emissions_per_capita": "consumption_co2_per_capita", - "consumption_emissions_per_gdp": "consumption_co2_per_gdp", - "cumulative_emissions_total": "cumulative_co2", - "cumulative_emissions_from_cement": "cumulative_cement_co2", - "cumulative_emissions_from_coal": "cumulative_coal_co2", - "cumulative_emissions_from_flaring": "cumulative_flaring_co2", - "cumulative_emissions_from_gas": "cumulative_gas_co2", - "cumulative_emissions_from_oil": "cumulative_oil_co2", - "cumulative_emissions_from_other_industry": "cumulative_other_co2", - "pct_traded_emissions": "trade_co2_share", - "emissions_total_as_share_of_global": "share_global_co2", - "emissions_from_cement_as_share_of_global": "share_global_cement_co2", - "emissions_from_coal_as_share_of_global": 
"share_global_coal_co2", - "emissions_from_flaring_as_share_of_global": "share_global_flaring_co2", - "emissions_from_gas_as_share_of_global": "share_global_gas_co2", - "emissions_from_oil_as_share_of_global": "share_global_oil_co2", - "emissions_from_other_industry_as_share_of_global": "share_global_other_co2", - "cumulative_emissions_total_as_share_of_global": "share_global_cumulative_co2", - "cumulative_emissions_from_cement_as_share_of_global": "share_global_cumulative_cement_co2", - "cumulative_emissions_from_coal_as_share_of_global": "share_global_cumulative_coal_co2", - "cumulative_emissions_from_flaring_as_share_of_global": "share_global_cumulative_flaring_co2", - "cumulative_emissions_from_gas_as_share_of_global": "share_global_cumulative_gas_co2", - "cumulative_emissions_from_oil_as_share_of_global": "share_global_cumulative_oil_co2", - "cumulative_emissions_from_other_industry_as_share_of_global": "share_global_cumulative_other_co2", - # New variables, related to land-use change emissions. 
- "cumulative_emissions_from_land_use_change": "cumulative_luc_co2", - "cumulative_emissions_from_land_use_change_as_share_of_global": "share_global_cumulative_luc_co2", - "cumulative_emissions_total_including_land_use_change": "cumulative_co2_including_luc", - "cumulative_emissions_total_including_land_use_change_as_share_of_global": "share_global_cumulative_co2_including_luc", - "emissions_from_land_use_change": "land_use_change_co2", - "emissions_from_land_use_change_as_share_of_global": "share_global_luc_co2", - "emissions_from_land_use_change_per_capita": "land_use_change_co2_per_capita", - "emissions_total_including_land_use_change": "co2_including_luc", - "emissions_total_including_land_use_change_as_share_of_global": "share_global_co2_including_luc", - "emissions_total_including_land_use_change_per_capita": "co2_including_luc_per_capita", - "emissions_total_including_land_use_change_per_gdp": "co2_including_luc_per_gdp", - "emissions_total_including_land_use_change_per_unit_energy": "co2_including_luc_per_unit_energy", - "growth_emissions_total_including_land_use_change": "co2_including_luc_growth_abs", - "pct_growth_emissions_total_including_land_use_change": "co2_including_luc_growth_prct", -} -JONES_COLUMNS = { - "country": "country", - "year": "year", - "temperature_response_co2_total": "temperature_change_from_co2", - "temperature_response_ghg_total": "temperature_change_from_ghg", - "temperature_response_ch4_total": "temperature_change_from_ch4", - "temperature_response_n2o_total": "temperature_change_from_n2o", - "share_of_temperature_response_ghg_total": "share_of_temperature_change_from_ghg", -} -CLIMATE_WATCH_GHG_COLUMNS = { - "country": "country", - "year": "year", - "total_ghg_emissions_excluding_lucf": "total_ghg_excluding_lucf", - "total_ghg_emissions_excluding_lucf_per_capita": "ghg_excluding_lucf_per_capita", - "total_ghg_emissions_including_lucf": "total_ghg", - "total_ghg_emissions_including_lucf_per_capita": "ghg_per_capita", -} 
-CLIMATE_WATCH_CH4_COLUMNS = { - "country": "country", - "year": "year", - "total_ch4_emissions_including_lucf": "methane", - "total_ch4_emissions_including_lucf_per_capita": "methane_per_capita", -} -CLIMATE_WATCH_N2O_COLUMNS = { - "country": "country", - "year": "year", - "total_n2o_emissions_including_lucf": "nitrous_oxide", - "total_n2o_emissions_including_lucf_per_capita": "nitrous_oxide_per_capita", -} -PRIMARY_ENERGY_COLUMNS = { - "country": "country", - "year": "year", - "primary_energy_consumption__twh": "primary_energy_consumption", - "primary_energy_consumption_per_capita__kwh": "energy_per_capita", - "primary_energy_consumption_per_gdp__kwh_per_dollar": "energy_per_gdp", -} -REGIONS_COLUMNS = { - "name": "country", - "iso_alpha3": "iso_code", -} -POPULATION_COLUMNS = { - "country": "country", - "year": "year", - "population": "population", -} -GDP_COLUMNS = { - "country": "country", - "year": "year", - "gdp": "gdp", -} - -UNITS = {"tonnes": {"conversion": TONNES_TO_MILLION_TONNES, "new_unit": "million tonnes", "new_short_unit": "Mt"}} - - -def convert_units(table: Table) -> Table: - """Convert units of table. - - Parameters - ---------- - table : Table - Data with its original units. - - Returns - ------- - Table - Data after converting units of specific columns. - - """ - table = table.copy() - # Check units and convert to more convenient ones. 
- for column in table.columns: - unit = table[column].metadata.unit - title = table[column].metadata.title - description_short = table[column].metadata.description or table[column].metadata.description_short - if unit in list(UNITS): - table[column] *= UNITS[unit]["conversion"] - table[column].metadata.unit = UNITS[unit]["new_unit"] - table[column].metadata.short_unit = UNITS[unit]["new_short_unit"] - table[column].metadata.title = title.replace(unit, UNITS[unit]["new_unit"]) - table[column].metadata.description_short = description_short.replace(unit, UNITS[unit]["new_unit"]) - - return table - - -def combine_tables( - tb_gcp: Table, - tb_jones: Table, - tb_climate_watch_ghg: Table, - tb_climate_watch_ch4: Table, - tb_climate_watch_n2o: Table, - tb_energy: Table, - tb_gdp: Table, - tb_population: Table, - tb_regions: Table, -) -> Table: - """Combine tables. - - Parameters - ---------- - tb_gcp : Table - Global Carbon Budget table (from Global Carbon Project). - tb_jones : Table - National contributions to climate change (from Jones et al. (2023)). - tb_climate_watch_ghg : Table - Greenhouse gas emissions table (from Climate Watch). - tb_climate_watch_ch4 : Table - CH4 emissions table (from Climate Watch). - tb_climate_watch_n2o : Table - N2O emissions table (from Climate Watch). - tb_energy : Table - Primary energy consumption table (from BP & EIA). - tb_gdp : Table - Maddison GDP table (from GGDC). - tb_population : Table - OWID population table (from various sources). - tb_regions : Table - OWID regions table. - - Returns - ------- - combined : Table - Combined table with metadata and variables metadata. - - """ - # Combine main tables (with an outer join, to gather all entities from all tables). 
- combined = tb_gcp.copy() - for table in [tb_jones, tb_climate_watch_ghg, tb_climate_watch_ch4, tb_climate_watch_n2o]: - combined = combined.merge(table, on=["country", "year"], how="outer", short_name=paths.short_name) - - # Add secondary tables (with a left join, to keep only entities for which we have emissions data). - for table in [tb_energy, tb_gdp, tb_population]: - combined = combined.merge(table, on=["country", "year"], how="left") - - # Countries-regions dataset does not have a year column, so it has to be merged on country. - combined = combined.merge(tb_regions, on="country", how="left") - - # Check that there were no repetition in column names. - error = "Repeated columns in combined data." - assert len([column for column in set(combined.columns) if "_x" in column]) == 0, error - - # Adjust units. - combined = convert_units(combined) - - return combined - - -def prepare_outputs(combined: Table, ds_regions: Dataset) -> Table: - """Clean and prepare output table. - - Parameters - ---------- - combined : Table - Combined table. - ds_regions : Dataset - Regions dataset, only used to get its version. - - Returns - ------- - combined: Table - Cleaned combined table. - - """ - # Remove rows that only have nan (ignoring if country, year, iso_code, population and gdp do have data). - columns_that_must_have_data = [ - column for column in combined.columns if column not in ["country", "year", "iso_code", "population", "gdp"] - ] - combined = combined.dropna(subset=columns_that_must_have_data, how="all").reset_index(drop=True) - - # Add metadata to the ISO column (loaded from the regions dataset). - combined["iso_code"].m.origins = [ - Origin( - producer="International Organization for Standardization", - title="Regions", - date_published=ds_regions.version, - ) - ] - combined["iso_code"].metadata.title = "ISO code" - combined["iso_code"].metadata.description_short = "ISO 3166-1 alpha-3 three-letter country codes." 
- combined["iso_code"].metadata.unit = "" - - # Sanity check. - columns_with_inf = [column for column in combined.columns if len(combined[combined[column] == np.inf]) > 0] - assert len(columns_with_inf) == 0, f"Infinity values detected in columns: {columns_with_inf}" - - # Set index and sort conveniently. - combined = combined.set_index(["country", "year"], verify_integrity=True).sort_index() - - return combined +def prepare_readme(tb: Table) -> str: + # NOTE: In a future update, we could figure out a way to generate the main content of the README from the table's metadata (possibly with the help of VersionTracker). + # origins = {origin.title_snapshot or origin.title: origin for origin in set(sum([tb[column].metadata.origins for column in tb.columns], []))} + readme = """\ +# Data on CO2 and Greenhouse Gas Emissions by *Our World in Data* + +Our complete CO2 and Greenhouse Gas Emissions dataset is a collection of key metrics maintained by [*Our World in Data*](https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions). It is updated regularly and includes data on CO2 emissions (annual, per capita, cumulative and consumption-based), other greenhouse gases, energy mix, and other relevant metrics. + +## The complete *Our World in Data* CO2 and Greenhouse Gas Emissions dataset + +### 🗂️ Download our complete CO2 and Greenhouse Gas Emissions dataset : [CSV](https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.csv) | [XLSX](https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.xlsx) | [JSON](https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.json) + +The CSV and XLSX files follow a format of 1 row per location and year. The JSON version is split by country, with an array of yearly records. + +The indicators represent all of our main data related to CO2 emissions, other greenhouse gas emissions, energy mix, as well as other indicators of potential interest. 
+ +We will continue to publish updated data on CO2 and Greenhouse Gas Emissions as it becomes available. Most metrics are published on an annual basis. + +A [full codebook](https://github.com/owid/co2-data/blob/master/owid-co2-codebook.csv) is made available, with a description and source for each indicator in the dataset. This codebook is also included as an additional sheet in the XLSX file. + +## Our source data and code + +The dataset is built upon a number of datasets and processing steps: + +- Statistical review of world energy (Energy Institute, EI): + - [Source data](https://www.energyinst.org/statistical-review) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/energy_institute/2024-06-20/statistical_review_of_world_energy.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/energy_institute/2024-06-20/statistical_review_of_world_energy.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/energy_institute/2024-06-20/statistical_review_of_world_energy.py) +- International energy data (U.S. 
Energy Information Administration, EIA): + - [Source data](https://www.eia.gov/opendata/bulkfiles.php) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/eia/2023-12-12/international_energy_data.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/eia/2023-12-12/energy_consumption.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/eia/2023-12-12/energy_consumption.py) +- Primary energy consumption (Our World in Data based on EI's Statistical review of world energy & EIA's International energy data): + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/energy/2024-06-20/primary_energy_consumption.py) +- Global carbon budget - Fossil CO2 emissions (Global Carbon Project): + - [Source data](https://zenodo.org/records/13981696/files/GCB2024v17_MtCO2_flat.csv) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget - Global carbon emissions (Global Carbon Project): + - [Source data](https://globalcarbonbudgetdata.org/downloads/jGJH0-data/Global_Carbon_Budget_2024_v1.0.xlsx) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget - National fossil carbon emissions (Global Carbon Project): + - [Source data](https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_Fossil_Carbon_Emissions_2024v1.0.xlsx) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget - National land-use change carbon emissions (Global Carbon Project): + - [Source data](https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_LandUseChange_Carbon_Emissions_2024v1.0.xlsx) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/gcp/2024-11-13/global_carbon_budget.py) +- Global carbon budget (Our World in 
Data based on the Global Carbon Project's Fossil CO2 emissions, Global carbon emissions, National fossil carbon emissions, and National land-use change emissions): + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/gcp/2024-11-13/global_carbon_budget.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/gcp/2024-11-13/global_carbon_budget.py) +- National contributions to climate change (Jones et al. (2024)): + - [Source data](https://zenodo.org/records/7636699/latest) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/emissions/2024-04-08/national_contributions.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/emissions/2024-04-08/national_contributions.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/emissions/2024-04-08/national_contributions.py) +- Greenhouse gas emissions (including methane and nitrous oxide) by sector (Climate Watch): + - [Source data](https://www.climatewatchdata.org/ghg-emissions) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/climate_watch/2023-10-31/emissions_by_sector.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/climate_watch/2023-10-31/emissions_by_sector.py) + - [Further processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/climate_watch/2023-10-31/emissions_by_sector.py) +- CO2 dataset (Our World in Data based on all sources above): + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/external/co2_data/latest/owid_co2.py) + - [Exporting code](https://github.com/owid/co2-data/blob/master/scripts/make_dataset.py) + - [Uploading code](https://github.com/owid/co2-data/blob/master/scripts/upload_datasets_to_s3.py) + +Additionally, to construct indicators per capita and per GDP, we use the following datasets and processing steps: +- 
Regions (Our World in Data). + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/regions/2023-01-01/regions.py) +- Population (Our World in Data based on [a number of different sources](https://ourworldindata.org/population-sources)). + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/demography/2023-03-31/population/__init__.py) +- Income groups (World Bank). + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/wb/2024-03-11/income_groups.py) +- GDP (University of Groningen GGDC's Maddison Project Database, Bolt and van Zanden, 2024). + - [Source data](https://www.rug.nl/ggdc/historicaldevelopment/maddison/releases/maddison-project-database-2023) + - [Ingestion code](https://github.com/owid/etl/blob/master/snapshots/ggdc/2024-04-26/maddison_project_database.py) + - [Basic processing code](https://github.com/owid/etl/blob/master/etl/steps/data/meadow/ggdc/2024-04-26/maddison_project_database.py) + - [Processing code](https://github.com/owid/etl/blob/master/etl/steps/data/garden/ggdc/2024-04-26/maddison_project_database.py) + +## Changelog + +- 2024-11-21: + - Updated dataset (and codebook) to use the latest version of the Global Carbon Budget (2024). +- 2024-06-20: + - Update data from the Statistical Review of World Energy. + - Update data from the Maddison Project Database. +- 2024-04-10: + - Updated dataset and codebook to use the latest version of the data on National contributions to climate change (Jones et al. (2024)). +- 2023-12-28: + - Enhanced codebook (improved descriptions, added units, updated sources). + - Updated primary energy consumption (to update metadata, nothing has changed in the data). +- 2023-12-05: + - Updated dataset (and codebook) to use the latest version of the Global Carbon Budget (2023). + - In this version, "International transport" has been replaced by "International aviation" and "International shipping". 
Also, some overseas territories have no data in this version. More details on the changes can be found in the pdf file hosted [here](https://zenodo.org/records/10177738). +- 2023-11-08: + - Updated CO2 emissions data to use the latest emissions by sector from Climate Watch (2023). + - Update codebook accordingly. +- 2023-10-16: + - Improved codebook. + - Fixed issue related to consumption-based emissions in Africa, and Palau emissions. +- 2023-07-10: + - Updated primary energy consumption and other indicators relying on energy data, to use the latest Statistical Review of World Energy by the Energy Institute. + - Renamed countries 'East Timor' and 'Faroe Islands'. +- 2023-05-04: + - Added indicators `share_of_temperature_change_from_ghg`, `temperature_change_from_ch4`, `temperature_change_from_co2`, `temperature_change_from_ghg`, and `temperature_change_from_n2o` using data from Jones et al. (2023). +- 2022-11-11: + - Updated CO2 emissions data with the newly released Global Carbon Budget (2022) by the Global Carbon Project. + - Added various new indicators related to national land-use change emissions. + - Added the emissions of the 1991 Kuwaiti oil fires in Kuwait's emissions (while also keeping 'Kuwaiti Oil Fires (GCP)' as a separate entity), to properly account for these emissions in the aggregate of Asia. + - Applied minor changes to entity names (e.g. "Asia (excl. China & India)" -> "Asia (excl. China and India)"). +- 2022-09-06: + - Updated data on primary energy consumption (from BP & EIA) and greenhouse gas emissions by sector (from CAIT). + - Refactored code, since now this repository simply loads the data, generates the output files, and uploads them to the cloud; the code to generate the dataset is now in our [etl repository](https://github.com/owid/etl). + - Minor changes in the codebook. +- 2022-04-15: + - Updated primary energy consumption data. + - Updated CO2 data to include aggregations for the different country income levels. 
+- 2022-02-24: + - Updated greenhouse gas emissions data from CAIT Climate Data Explorer. + - Included two new columns in dataset: total greenhouse gases excluding land-use change and forestry, and the same as per capita values. +- 2021-11-05: Updated CO2 emissions data with the newly released Global Carbon Budget (v2021). +- 2021-09-16: + - Fixed data quality issues in CO2 emissions indicators (emissions less than 0, missing data for Eswatini, ...). + - Replaced all input CSVs with data retrieved directly from ourworldindata.org. +- 2021-02-08: Updated this dataset with the latest annual release from the Global Carbon Project. +- 2020-08-07: The first version of this dataset was made available. + +## Data alterations + +- **We standardize names of countries and regions.** Since the names of countries and regions are different in different data sources, we standardize all names in order to minimize data loss during data merges. +- **We recalculate carbon emissions to CO2.** The primary data sources on CO2 emissions—the Global Carbon Project, for example—typically report emissions in tonnes of carbon. We have recalculated these figures as tonnes of CO2 using a conversion factor of 3.664. +- **We calculate per capita figures.** All of our per capita figures are calculated from our metric `Population`, which is included in the complete dataset. These population figures are sourced from [Gapminder](http://gapminder.org) and the [UN World Population Prospects (UNWPP)](https://population.un.org/wpp/). + +## License + +All visualizations, data, and code produced by _Our World in Data_ are completely open access under the [Creative Commons BY license](https://creativecommons.org/licenses/by/4.0/). You have the permission to use, distribute, and reproduce these in any medium, provided the source and authors are credited. + +The data produced by third parties and made available by _Our World in Data_ is subject to the license terms from the original third-party authors. 
We will always indicate the original source of the data in our database, and you should always check the license of any such third-party data before use. + +## Authors + +This data has been collected, aggregated, and documented by Hannah Ritchie, Max Roser, Edouard Mathieu, Bobbie Macdonald and Pablo Rosado. + +The mission of *Our World in Data* is to make data and research on the world's largest problems understandable and accessible. [Read more about our mission](https://ourworldindata.org/about). + + +## How to cite this data? + +If you are using this dataset, please cite both [Our World in Data](https://ourworldindata.org/co2-and-greenhouse-gas-emissions#article-citation) and the underlying data source(s). + +Please follow [the guidelines in our FAQ](https://ourworldindata.org/faqs#citing-work-produced-by-third-parties-and-made-available-by-our-world-in-data) on how to cite our work. -def run(dest_dir: str) -> None: - # - # Load data. - # - # Load the global carbon budget dataset from the Global Carbon Project (GCP). - ds_gcp = paths.load_dataset("global_carbon_budget") - - # Load the Jones et al. (2023) dataset on national contributions to climate change. - ds_jones = paths.load_dataset("national_contributions") - - # Load the greenhouse gas emissions by sector dataset by Climate Watch. - ds_climate_watch = paths.load_dataset("emissions_by_sector") +""" + return readme - # Load the GDP dataset by GGDC Maddison. - ds_gdp = paths.load_dataset("maddison_project_database") - # Load primary energy consumption dataset (by different sources in our 'energy' namespace). - ds_energy = paths.load_dataset("primary_energy_consumption") +def prepare_and_save_outputs(tb: Table, codebook: Table, temp_dir_path: Path) -> None: + # Create codebook and save it as a csv file. + log.info("Creating codebook csv file.") + pd.DataFrame(codebook).to_csv(temp_dir_path / "owid-co2-codebook.csv", index=False) - # Load population dataset. 
- ds_population = paths.load_dataset("population") + # Create a csv file. + log.info("Creating csv file.") + pd.DataFrame(tb).to_csv(temp_dir_path / "owid-co2-data.csv", index=False, float_format="%.3f") - # Load countries-regions dataset (required to get ISO codes). - ds_regions = paths.load_dataset("regions") + # Create a README file. + log.info("Creating README file.") + readme = prepare_readme(tb) + (temp_dir_path / "README.md").write_text(readme) - # Gather all required tables from all datasets. - tb_gcp = ds_gcp["global_carbon_budget"] - tb_jones = ds_jones["national_contributions"] - tb_climate_watch_ghg = ds_climate_watch["greenhouse_gas_emissions_by_sector"] - tb_climate_watch_ch4 = ds_climate_watch["methane_emissions_by_sector"] - tb_climate_watch_n2o = ds_climate_watch["nitrous_oxide_emissions_by_sector"] - tb_energy = ds_energy["primary_energy_consumption"] - tb_gdp = ds_gdp["maddison_project_database"] - tb_population = ds_population["population"] - tb_regions = ds_regions["regions"] +def run(dest_dir: str) -> None: # - # Process data. + # Load data. # - # Choose required columns and rename them. 
- tb_gcp = tb_gcp.reset_index()[list(GCP_COLUMNS)].rename(columns=GCP_COLUMNS, errors="raise") - tb_jones = tb_jones.reset_index()[list(JONES_COLUMNS)].rename(columns=JONES_COLUMNS, errors="raise") - tb_climate_watch_ghg = tb_climate_watch_ghg.reset_index()[list(CLIMATE_WATCH_GHG_COLUMNS)].rename( - columns=CLIMATE_WATCH_GHG_COLUMNS, errors="raise" - ) - tb_climate_watch_ch4 = tb_climate_watch_ch4.reset_index()[list(CLIMATE_WATCH_CH4_COLUMNS)].rename( - columns=CLIMATE_WATCH_CH4_COLUMNS, errors="raise" - ) - tb_climate_watch_n2o = tb_climate_watch_n2o.reset_index()[list(CLIMATE_WATCH_N2O_COLUMNS)].rename( - columns=CLIMATE_WATCH_N2O_COLUMNS, errors="raise" - ) - tb_energy = tb_energy.reset_index()[list(PRIMARY_ENERGY_COLUMNS)].rename( - columns=PRIMARY_ENERGY_COLUMNS, errors="raise" - ) - tb_gdp = tb_gdp.reset_index()[list(GDP_COLUMNS)].rename(columns=GDP_COLUMNS, errors="raise") - tb_population = tb_population.reset_index()[list(POPULATION_COLUMNS)].rename( - columns=POPULATION_COLUMNS, errors="raise" - ) - tb_regions = tb_regions.reset_index()[list(REGIONS_COLUMNS)].rename(columns=REGIONS_COLUMNS, errors="raise") - - # Combine tables. - combined = combine_tables( - tb_gcp=tb_gcp, - tb_jones=tb_jones, - tb_climate_watch_ghg=tb_climate_watch_ghg, - tb_climate_watch_ch4=tb_climate_watch_ch4, - tb_climate_watch_n2o=tb_climate_watch_n2o, - tb_energy=tb_energy, - tb_gdp=tb_gdp, - tb_population=tb_population, - tb_regions=tb_regions, - ) - - # Prepare outputs. - combined = prepare_outputs(combined=combined, ds_regions=ds_regions) + # Load the owid_co2 emissions dataset from garden, and read its main table and codebook. + ds_gcp = paths.load_dataset("owid_co2") + tb = ds_gcp.read("owid_co2") + codebook = ds_gcp.read("owid_co2_codebook") + # + # Save outputs. 
+ # # If you want to really commit the data, use `CO2_BRANCH=my-branch etlr github/co2_data --export` if os.environ.get("CO2_BRANCH"): dry_run = False @@ -364,11 +224,27 @@ def run(dest_dir: str) -> None: dry_run = True branch = "master" - gh.commit_file_to_github( - combined.to_csv(), - repo_name="co2-data", - file_path="owid-co2-data.csv", - commit_message=":bar_chart: Automated update", - branch=branch, - dry_run=dry_run, - ) + # Uncomment to inspect changes. + # from etl.data_helpers.misc import compare_tables + # branch = "update-gcb-data" + # old = pd.read_csv("https://raw.githubusercontent.com/owid/co2-data/refs/heads/master/owid-co2-data.csv") + # new = pd.read_csv(f"https://raw.githubusercontent.com/owid/co2-data/refs/heads/{branch}/owid-co2-data.csv") + # compare_tables(old, new, countries=["World"]) + + # Create a temporary directory for all files to be committed. + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + + prepare_and_save_outputs(tb, codebook=codebook, temp_dir_path=temp_dir_path) + + # Commit csv files to the repos. + for file_name in ["owid-co2-data.csv", "owid-co2-codebook.csv", "README.md"]: + with (temp_dir_path / file_name).open("r") as file_content: + gh.commit_file_to_github( + file_content.read(), + repo_name="co2-data", + file_path=file_name, + commit_message=":bar_chart: Automated update", + branch=branch, + dry_run=dry_run, + ) diff --git a/etl/steps/export/s3/co2_data/latest/owid_co2.py b/etl/steps/export/s3/co2_data/latest/owid_co2.py new file mode 100644 index 00000000000..65973768684 --- /dev/null +++ b/etl/steps/export/s3/co2_data/latest/owid_co2.py @@ -0,0 +1,132 @@ +"""Garden step that combines various datasets related to greenhouse emissions and produces the OWID CO2 dataset. + +The combined datasets are: +* Global Carbon Budget - Global Carbon Project. +* National contributions to climate change - Jones et al. +* Greenhouse gas emissions by sector - Climate Watch. 
+* Primary energy consumption - EI & EIA. + +Additionally, OWID's regions dataset, population dataset and Maddison Project Database (Bolt and van Zanden, 2023) on +GDP are included. + +Outputs: +* The data in three different formats will also be uploaded to S3, and will be made publicly available, in: + * https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.csv + * https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.xlsx + * https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.json + +""" +import json +import tempfile +from pathlib import Path + +import pandas as pd +from owid.catalog import Table +from owid.datautils.s3 import S3 +from structlog import get_logger +from tqdm.auto import tqdm + +from etl.helpers import PathFinder + +# Initialize logger. +log = get_logger() + +# Define S3 base URL. +S3_URL = "https://nyc3.digitaloceanspaces.com" +# Profile name to use for S3 client (as defined in .aws/config). +S3_PROFILE_NAME = "default" +# S3 bucket name and folder where dataset files will be stored. +S3_BUCKET_NAME = "owid-public" +S3_DATA_DIR = Path("data/co2") + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def save_data_to_json(tb: Table, output_path: str) -> None: + tb = tb.copy() + + # Initialize output dictionary, that contains one item per country in the data. + output_dict = {} + + # Each country contains a dictionary, which contains: + # * "iso_code", which is the ISO code (as a string), if it exists. + # * "data", which is a list of dictionaries, one per year. + # Each dictionary contains "year" as the first item, followed by all other non-nan indicator values for that year. + for country in sorted(set(tb["country"])): + # Initialize output dictionary for current country. + output_dict[country] = {} + + # If there is an ISO code for this country, add it as a new item of the dictionary. 
+ iso_code = tb[tb["country"] == country].iloc[0]["iso_code"] + if not pd.isna(iso_code): + output_dict[country]["iso_code"] = iso_code + + # Create the data dictionary for this country. + dict_country = tb[tb["country"] == country].drop(columns=["country", "iso_code"]).to_dict(orient="records") + # Remove all nans. + data_country = [ + {indicator: value for indicator, value in d_year.items() if not pd.isna(value)} for d_year in dict_country + ] + output_dict[country]["data"] = data_country + + # Write dictionary to file as a big json object. + with open(output_path, "w") as file: + file.write(json.dumps(output_dict, indent=4)) + + +def prepare_and_save_outputs(tb: Table, codebook: Table, temp_dir_path: Path) -> None: + # Create a csv file. + log.info("Creating csv file.") + pd.DataFrame(tb).to_csv(temp_dir_path / "owid-co2-data.csv", index=False, float_format="%.3f") + + # Create a json file. + log.info("Creating json file.") + save_data_to_json(tb, temp_dir_path / "owid-co2-data.json") + + # Create an excel file. + log.info("Creating excel file.") + with pd.ExcelWriter(temp_dir_path / "owid-co2-data.xlsx") as writer: + tb.to_excel(writer, sheet_name="Data", index=False, float_format="%.3f") + codebook.to_excel(writer, sheet_name="Metadata") + + +def run(dest_dir: str) -> None: + # + # Load data. + # + # Load the owid_co2 emissions dataset from garden, and read its main table. + ds_gcp = paths.load_dataset("owid_co2") + tb = ds_gcp.read("owid_co2") + codebook = ds_gcp.read("owid_co2_codebook") + + # + # Save outputs. + # + # Create a temporary directory for all files to be committed. + with tempfile.TemporaryDirectory() as temp_dir: + ################################################################################################################ + # TODO: Create new public files and update the way we write to them. + log.warning( + "This implementation currently does not work. We should create an R2 public bucket and update the way we write to it. 
For now, manually update files in Digital Ocean using the web interface." + ) + ################################################################################################################ + + temp_dir_path = Path(temp_dir) + + prepare_and_save_outputs(tb, codebook=codebook, temp_dir_path=temp_dir_path) + + # Initialise S3 client. + s3 = S3(profile_name=S3_PROFILE_NAME) + for file_name in tqdm(["owid-co2-data.csv", "owid-co2-data.xlsx", "owid-co2-data.json"]): + # Path to local file. + local_file = temp_dir_path / file_name + # Path (within bucket) to S3 file. + s3_file = Path("data/co2") / file_name + tqdm.write(f"Uploading file {local_file} to S3 bucket {S3_BUCKET_NAME} as {s3_file}.") + # Upload and make public each of the files. + s3.upload_to_s3( + local_path=str(local_file), + s3_path=f"s3://{S3_BUCKET_NAME}/{str(s3_file)}", + public=True, + ) diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget.py b/snapshots/gcp/2024-11-13/global_carbon_budget.py index 5821b60d6b3..f5364b94ee7 100644 --- a/snapshots/gcp/2024-11-13/global_carbon_budget.py +++ b/snapshots/gcp/2024-11-13/global_carbon_budget.py @@ -42,8 +42,7 @@ @click.command() @click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") -@click.option("--path-to-folder", prompt=True, type=str, help="Path to local folder where data files are.") -def main(path_to_folder: str, upload: bool) -> None: +def main(upload: bool) -> None: # Create a new snapshot for each dataset. for data_file in DATA_FILES: snap = Snapshot(f"gcp/{SNAPSHOT_VERSION}/{data_file}") @@ -58,13 +57,7 @@ def main(path_to_folder: str, upload: bool) -> None: snap.metadata_path.write_text(snap.metadata.to_yaml()) # Download data from source, add file to DVC and upload to S3. 
- ################################################################################################################ - # snap.create_snapshot(upload=upload) - # TODO: Once public, remove this, uncomment previous, and remove click.option for path to folder. - path_to_file = Path(path_to_folder) / data_file - assert path_to_file.exists(), f"File {path_to_file} does not exist." - snap.create_snapshot(filename=path_to_file, upload=upload) - ################################################################################################################ + snap.create_snapshot(upload=upload) if __name__ == "__main__": diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc index 828af5c0f90..feba80ce3c0 100644 --- a/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_fossil_co2_emissions.csv.dvc @@ -17,12 +17,13 @@ meta: attribution: Global Carbon Budget (2024) attribution_short: GCB url_main: https://globalcarbonbudget.org/ + url_download: https://zenodo.org/records/13981696/files/GCB2024v17_MtCO2_flat.csv date_accessed: '2024-11-13' date_published: '2024-11-13' license: name: CC BY 4.0 url: https://zenodo.org/records/10177738 outs: - - md5: eefcbe53b9da64d615a170970496c7c1 - size: 2869860 + - md5: 70dac1843444b14655bf756c70c1f04a + size: 3128569 path: global_carbon_budget_fossil_co2_emissions.csv diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc index ae93d96b8c4..edaaf1bd695 100644 --- a/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_global_emissions.xlsx.dvc @@ -17,12 +17,13 @@ meta: attribution: Global Carbon Budget (2024) attribution_short: GCB url_main: https://globalcarbonbudget.org/ + url_download: 
https://globalcarbonbudgetdata.org/downloads/jGJH0-data/Global_Carbon_Budget_2024_v1.0.xlsx date_accessed: '2024-11-13' date_published: '2024-11-13' license: name: CC BY 4.0 url: https://www.icos-cp.eu/data-services/about-data-portal/data-license outs: - - md5: 5f390794f439c38bd343ded9c6872166 - size: 454431 + - md5: ba4ef8c16f172438e1ae283f20ef92e1 + size: 406583 path: global_carbon_budget_global_emissions.xlsx diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc index 6e7a9a4117c..cfb7e546564 100644 --- a/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_land_use_change_emissions.xlsx.dvc @@ -17,12 +17,13 @@ meta: attribution: Global Carbon Budget (2024) attribution_short: GCB url_main: https://globalcarbonbudget.org/ + url_download: https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_LandUseChange_Carbon_Emissions_2024v1.0.xlsx date_accessed: '2024-11-13' date_published: '2024-11-13' license: name: CC BY 4.0 url: https://www.icos-cp.eu/data-services/about-data-portal/data-license outs: - - md5: 0ac3b67c3f24414f983913a5d227f1cd - size: 1592979 + - md5: 3415714f06bf3c00dbd675e40a73152b + size: 1257748 path: global_carbon_budget_land_use_change_emissions.xlsx diff --git a/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc b/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc index 279b90125de..d0399880bda 100644 --- a/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc +++ b/snapshots/gcp/2024-11-13/global_carbon_budget_national_emissions.xlsx.dvc @@ -17,12 +17,13 @@ meta: attribution: Global Carbon Budget (2024) attribution_short: GCB url_main: https://globalcarbonbudget.org/ + url_download: 
https://globalcarbonbudgetdata.org/downloads/jGJH0-data/National_Fossil_Carbon_Emissions_2024v1.0.xlsx date_accessed: '2024-11-13' date_published: '2024-11-13' license: name: CC BY 4.0 url: https://www.icos-cp.eu/data-services/about-data-portal/data-license outs: - - md5: ca5d4bc5b7b29255129d3e9df51548ad - size: 818314 + - md5: 15f157d2f4c6770c85883a33beec954c + size: 724957 path: global_carbon_budget_national_emissions.xlsx diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget.py b/snapshots/gcp/2024-11-21/global_carbon_budget.py new file mode 100644 index 00000000000..5821b60d6b3 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget.py @@ -0,0 +1,71 @@ +"""Script to create snapshots of the Global Carbon Budget data products. + +A snapshot will be created for each of the following datasets: +* Global Carbon Budget - Fossil CO2 emissions. +* Global Carbon Budget - Global emissions. +* Global Carbon Budget - Land-use change emissions. +* Global Carbon Budget - National emissions. + +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + +# Names of input data files to create snapshots for. +DATA_FILES = [ + "global_carbon_budget_fossil_co2_emissions.csv", + "global_carbon_budget_global_emissions.xlsx", + "global_carbon_budget_land_use_change_emissions.xlsx", + "global_carbon_budget_national_emissions.xlsx", +] + +# Define common metadata fields (to be written to dvc files). +ATTRIBUTION = "Global Carbon Budget (2024)" +ATTRIBUTION_SHORT = "GCB" +CITATION_FULL = """Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + +The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + +For more details, see the original paper: +Friedlingstein, P., O'Sullivan, M., Jones, M. 
W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023.""" + +DESCRIPTION = """The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. 
It's widely recognized as the most comprehensive report of its kind. + +The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies.""" + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-folder", prompt=True, type=str, help="Path to local folder where data files are.") +def main(path_to_folder: str, upload: bool) -> None: + # Create a new snapshot for each dataset. + for data_file in DATA_FILES: + snap = Snapshot(f"gcp/{SNAPSHOT_VERSION}/{data_file}") + + # Replace the full citation and description in the metadata. + snap.metadata.origin.attribution = ATTRIBUTION # type: ignore + snap.metadata.origin.attribution_short = ATTRIBUTION_SHORT # type: ignore + snap.metadata.origin.citation_full = CITATION_FULL # type: ignore + snap.metadata.origin.description = DESCRIPTION # type: ignore + + # Rewrite metadata to dvc file. + snap.metadata_path.write_text(snap.metadata.to_yaml()) + + # Download data from source, add file to DVC and upload to S3. + ################################################################################################################ + # snap.create_snapshot(upload=upload) + # TODO: Once public, remove this, uncomment previous, and remove click.option for path to folder. + path_to_file = Path(path_to_folder) / data_file + assert path_to_file.exists(), f"File {path_to_file} does not exist." 
+ snap.create_snapshot(filename=path_to_file, upload=upload) + ################################################################################################################ + + +if __name__ == "__main__": + main() diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc new file mode 100644 index 00000000000..ee398009771 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_fossil_co2_emissions.csv.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Fossil CO2 emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. 
R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. 
+ attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://zenodo.org/records/10177738 +outs: + - md5: eefcbe53b9da64d615a170970496c7c1 + size: 2869860 + path: global_carbon_budget_fossil_co2_emissions.csv diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc new file mode 100644 index 00000000000..91040f5e5a9 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_global_emissions.xlsx.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Global emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. 
P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. 
+ attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: ad8155e5db112173f5ab465205e30680 + size: 941480 + path: global_carbon_budget_global_emissions.xlsx diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc new file mode 100644 index 00000000000..ee07763c9d0 --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_land_use_change_emissions.xlsx.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - Land-use change emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. 
P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. 
+ attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: afec716d828628dd37928197b2b545e1 + size: 1264402 + path: global_carbon_budget_land_use_change_emissions.xlsx diff --git a/snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc b/snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc new file mode 100644 index 00000000000..ede8c8d039d --- /dev/null +++ b/snapshots/gcp/2024-11-21/global_carbon_budget_national_emissions.xlsx.dvc @@ -0,0 +1,28 @@ +meta: + origin: + producer: Global Carbon Project + title: Global Carbon Budget + description: |- + The Global Carbon Budget was founded by the Global Carbon Project (GCP) international science team to track the trends in global carbon emissions and sinks and is a key measure of progress towards the goals of the Paris Agreement. It's widely recognized as the most comprehensive report of its kind. + + The GCP has been publishing estimates of global and national fossil CO2 emissions since 2001. In the first instance these were simple re-publications of data from another source, but over subsequent years refinements have been made in response to feedback and identification of inaccuracies. + title_snapshot: Global Carbon Budget - National emissions + citation_full: |- + Andrew, R. M., & Peters, G. P. (2024). The Global Carbon Project's fossil CO2 emissions dataset (2024v17) [Data set]. Zenodo. https://doi.org/10.5281/zenodo.13981696 + + The data files of the Global Carbon Budget can be found at: https://globalcarbonbudget.org/carbonbudget/ + + For more details, see the original paper: + Friedlingstein, P., O'Sullivan, M., Jones, M. W., Andrew, R. M., Bakker, D. C. E., Hauck, J., Landschützer, P., Le Quéré, C., Luijkx, I. T., Peters, G. 
P., Peters, W., Pongratz, J., Schwingshackl, C., Sitch, S., Canadell, J. G., Ciais, P., Jackson, R. B., Alin, S. R., Anthoni, P., Barbero, L., Bates, N. R., Becker, M., Bellouin, N., Decharme, B., Bopp, L., Brasika, I. B. M., Cadule, P., Chamberlain, M. A., Chandra, N., Chau, T.-T.-T., Chevallier, F., Chini, L. P., Cronin, M., Dou, X., Enyo, K., Evans, W., Falk, S., Feely, R. A., Feng, L., Ford, D. J., Gasser, T., Ghattas, J., Gkritzalis, T., Grassi, G., Gregor, L., Gruber, N., Gürses, Ö., Harris, I., Hefner, M., Heinke, J., Houghton, R. A., Hurtt, G. C., Iida, Y., Ilyina, T., Jacobson, A. R., Jain, A., Jarníková, T., Jersild, A., Jiang, F., Jin, Z., Joos, F., Kato, E., Keeling, R. F., Kennedy, D., Klein Goldewijk, K., Knauer, J., Korsbakken, J. I., Körtzinger, A., Lan, X., Lefèvre, N., Li, H., Liu, J., Liu, Z., Ma, L., Marland, G., Mayot, N., McGuire, P. C., McKinley, G. A., Meyer, G., Morgan, E. J., Munro, D. R., Nakaoka, S.-I., Niwa, Y., O'Brien, K. M., Olsen, A., Omar, A. M., Ono, T., Paulsen, M., Pierrot, D., Pocock, K., Poulter, B., Powis, C. M., Rehder, G., Resplandy, L., Robertson, E., Rödenbeck, C., Rosan, T. M., Schwinger, J., Séférian, R., Smallman, T. L., Smith, S. M., Sospedra-Alfonso, R., Sun, Q., Sutton, A. J., Sweeney, C., Takao, S., Tans, P. P., Tian, H., Tilbrook, B., Tsujino, H., Tubiello, F., van der Werf, G. R., van Ooijen, E., Wanninkhof, R., Watanabe, M., Wimart-Rousseau, C., Yang, D., Yang, X., Yuan, W., Yue, X., Zaehle, S., Zeng, J., and Zheng, B.: Global Carbon Budget 2023, Earth Syst. Sci. Data, 15, 5301-5369, https://doi.org/10.5194/essd-15-5301-2023, 2023. 
+ attribution: Global Carbon Budget (2024) + attribution_short: GCB + url_main: https://globalcarbonbudget.org/ + date_accessed: '2024-11-21' + date_published: '2024-11-21' + license: + name: CC BY 4.0 + url: https://www.icos-cp.eu/data-services/about-data-portal/data-license +outs: + - md5: 08d4f1086b4d2f6ec31fc48c6cd96e8e + size: 961527 + path: global_carbon_budget_national_emissions.xlsx